1 #ifndef SSE2NEON_H
2 #define SSE2NEON_H
3 
// This header file provides a simple API translation layer
// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions.
6 //
7 // This header file does not yet translate all of the SSE intrinsics.
8 //
9 // Contributors to this work are:
10 //   John W. Ratcliff <jratcliffscarab@gmail.com>
11 //   Brandon Rowlett <browlett@nvidia.com>
12 //   Ken Fast <kfast@gdeb.com>
13 //   Eric van Beurden <evanbeurden@nvidia.com>
14 //   Alexander Potylitsin <apotylitsin@nvidia.com>
15 //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
16 //   Jim Huang <jserv@biilabs.io>
17 //   Mark Cheng <marktwtn@biilabs.io>
18 //   Malcolm James MacLeod <malcolm@gulden.com>
19 //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20 //   Sebastian Pop <spop@amazon.com>
21 //   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22 //   Danila Kutenin <danilak@google.com>
23 //   François Turban (JishinMaster) <francois.turban@gmail.com>
24 //   Pei-Hsuan Hung <afcidk@gmail.com>
25 //   Yang-Hao Yuan <yanghau@biilabs.io>
26 //   Syoyo Fujita <syoyo@lighttransport.com>
27 //   Brecht Van Lommel <brecht@blender.org>
28 
29 /*
30  * sse2neon is freely redistributable under the MIT License.
31  *
32  * Permission is hereby granted, free of charge, to any person obtaining a copy
33  * of this software and associated documentation files (the "Software"), to deal
34  * in the Software without restriction, including without limitation the rights
35  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
36  * copies of the Software, and to permit persons to whom the Software is
37  * furnished to do so, subject to the following conditions:
38  *
39  * The above copyright notice and this permission notice shall be included in
40  * all copies or substantial portions of the Software.
41  *
42  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
47  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
48  * SOFTWARE.
49  */
50 
51 /* Tunable configurations */
52 
/* Enable precise implementations of math operations.
 * These are slightly slower, but give results consistent with x86 SSE2
 * (e.g. they can fix a hole or NaN pixel in a rendering result).
 */
57 /* _mm_min_ps and _mm_max_ps */
58 #ifndef SSE2NEON_PRECISE_MINMAX
59 #define SSE2NEON_PRECISE_MINMAX (0)
60 #endif
61 /* _mm_rcp_ps and _mm_div_ps */
62 #ifndef SSE2NEON_PRECISE_DIV
63 #define SSE2NEON_PRECISE_DIV (0)
64 #endif
65 /* _mm_sqrt_ps and _mm_rsqrt_ps */
66 #ifndef SSE2NEON_PRECISE_SQRT
67 #define SSE2NEON_PRECISE_SQRT (0)
68 #endif
69 #ifndef SSE2NEON_PRECISE_RSQRT
70 #define SSE2NEON_PRECISE_RSQRT (0)
71 #endif
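/* For example, a project that needs division results consistent with SSE can
 * enable the precise path by defining the corresponding macro before this
 * header is included (a minimal usage sketch):
 *
 *   #define SSE2NEON_PRECISE_DIV 1
 *   #include "sse2neon.h"
 */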
72 
73 #if defined(__GNUC__) || defined(__clang__)
74 #pragma push_macro("FORCE_INLINE")
75 #pragma push_macro("ALIGN_STRUCT")
76 #define FORCE_INLINE static inline __attribute__((always_inline))
77 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
78 #ifndef likely
79 #define likely(x) __builtin_expect(!!(x), 1)
80 #endif
81 #ifndef unlikely
82 #define unlikely(x) __builtin_expect(!!(x), 0)
83 #endif
84 #else
85 #error "Macro name collisions may happen with unsupported compiler."
86 #ifdef FORCE_INLINE
87 #undef FORCE_INLINE
88 #endif
89 #define FORCE_INLINE static inline
90 #ifndef ALIGN_STRUCT
91 #define ALIGN_STRUCT(x) __declspec(align(x))
92 #endif
93 #endif
94 #ifndef likely
95 #define likely(x) (x)
96 #endif
97 #ifndef unlikely
98 #define unlikely(x) (x)
99 #endif
100 
101 #include <stdint.h>
102 #include <stdlib.h>
103 
104 /* Architecture-specific build options */
105 /* FIXME: #pragma GCC push_options is only available on GCC */
106 #if defined(__GNUC__)
107 #if defined(__arm__) && __ARM_ARCH == 7
/* According to the ARM C Language Extensions (ACLE) specification,
 * __ARM_NEON is defined to a value indicating whether the Advanced SIMD
 * (NEON) architecture is supported.
 */
112 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
113 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
114 #endif
115 #if !defined(__clang__)
116 #pragma GCC push_options
117 #pragma GCC target("fpu=neon")
118 #endif
119 #elif defined(__aarch64__)
120 #if !defined(__clang__)
121 #pragma GCC push_options
122 #pragma GCC target("+simd")
123 #endif
124 #else
125 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
126 #endif
127 #endif
128 
129 #include <arm_neon.h>
130 
/* Rounding functions require either AArch64 instructions or a libm fallback */
132 #if !defined(__aarch64__)
133 #include <math.h>
134 #endif
135 
136 /* "__has_builtin" can be used to query support for built-in functions
137  * provided by gcc/clang and other compilers that support it.
138  */
139 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
140 /* Compatibility with gcc <= 9 */
141 #if __GNUC__ <= 9
142 #define __has_builtin(x) HAS##x
143 #define HAS__builtin_popcount 1
144 #define HAS__builtin_popcountll 1
145 #else
146 #define __has_builtin(x) 0
147 #endif
148 #endif
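/* With the fallback above, feature checks such as the following compile on
 * both clang and older GCC (a usage sketch):
 *
 *   #if __has_builtin(__builtin_popcount)
 *       // use __builtin_popcount(x)
 *   #endif
 */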
149 
/**
 * MACRO for the shuffle parameter of _mm_shuffle_ps().
 * Argument fp3 is a digit [0-3] that selects the float from argument "b" of
 * _mm_shuffle_ps to be placed in fp3 of the result; fp2 does the same for
 * fp2 of the result. fp1 is a digit [0-3] that selects the float from
 * argument "a" of _mm_shuffle_ps to be placed in fp1 of the result; fp0 does
 * the same for fp0 of the result.
 */
158 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
159     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
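/* For instance, broadcasting the lowest element of "a" into the two low
 * lanes and the highest element of "b" into the two high lanes of the result
 * could look like this (illustrative sketch):
 *
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 3, 0, 0));
 *   // r = { a0, a0, b3, b3 }
 */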
160 
161 /* Rounding mode macros. */
162 #define _MM_FROUND_TO_NEAREST_INT 0x00
163 #define _MM_FROUND_TO_NEG_INF 0x01
164 #define _MM_FROUND_TO_POS_INF 0x02
165 #define _MM_FROUND_TO_ZERO 0x03
166 #define _MM_FROUND_CUR_DIRECTION 0x04
167 #define _MM_FROUND_NO_EXC 0x08
168 #define _MM_ROUND_NEAREST 0x0000
169 #define _MM_ROUND_DOWN 0x2000
170 #define _MM_ROUND_UP 0x4000
171 #define _MM_ROUND_TOWARD_ZERO 0x6000
172 
173 /* indicate immediate constant argument in a given range */
174 #define __constrange(a, b) const
175 
176 /* A few intrinsics accept traditional data types like ints or floats, but
177  * most operate on data types that are specific to SSE.
178  * If a vector type ends in d, it contains doubles, and if it does not have
179  * a suffix, it contains floats. An integer vector type can contain any type
180  * of integer, from chars to shorts to unsigned long longs.
181  */
182 typedef int64x1_t __m64;
183 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On the 32-bit Arm architecture, float64x2_t is not supported.
// The data type __m128d therefore has to be represented differently for the
// related intrinsic conversions.
187 #if defined(__aarch64__)
188 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
189 #else
190 typedef float32x4_t __m128d;
191 #endif
192 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
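// For example, an SSE expression such as _mm_set1_epi32(42) is backed by a
// NEON int32x4_t internally and is handed back reinterpreted as the
// int64x2_t-based __m128i defined above.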
193 
194 /* type-safe casting between types */
195 
196 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
197 #define vreinterpretq_m128_f32(x) (x)
198 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
199 
200 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
201 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
202 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
203 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
204 
205 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
206 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
207 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
208 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
209 
210 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
211 #define vreinterpretq_f32_m128(x) (x)
212 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
213 
214 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
215 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
216 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
217 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
218 
219 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
220 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
221 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
222 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
223 
224 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
225 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
226 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
227 #define vreinterpretq_m128i_s64(x) (x)
228 
229 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
230 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
231 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
232 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
233 
234 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
235 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
236 
237 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
238 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
239 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
240 #define vreinterpretq_s64_m128i(x) (x)
241 
242 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
243 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
244 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
245 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
246 
247 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
248 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
249 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
250 #define vreinterpret_m64_s64(x) (x)
251 
252 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
253 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
254 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
255 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
256 
257 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
258 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
259 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
260 
261 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
262 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
263 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
264 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
265 
266 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
267 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
268 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
269 #define vreinterpret_s64_m64(x) (x)
270 
271 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
272 
273 #if defined(__aarch64__)
274 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
275 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
276 
277 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
278 
279 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
280 #define vreinterpretq_m128d_f64(x) (x)
281 
282 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
283 
284 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
285 
286 #define vreinterpretq_f64_m128d(x) (x)
287 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
288 #else
289 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
290 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
291 
292 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
293 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
294 
295 #define vreinterpretq_m128d_f32(x) (x)
296 
297 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
298 
299 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
300 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
301 
302 #define vreinterpretq_f32_m128d(x) (x)
303 #endif
304 
// A struct named 'SIMDVec' is defined in this header file; it can be used by
// applications that attempt to access the contents of an __m128 struct
// directly.  Note that accessing the __m128 struct directly is considered bad
// coding practice by Microsoft: @see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
//
// However, some legacy source code may try to access the contents of an
// __m128 struct directly, so the developer can use SIMDVec as an alias for
// it.  Any casting must be done manually by the developer, as you cannot cast
// or otherwise alias the base NEON data type for intrinsic operations.
//
// This union is intended to allow direct access to an __m128 variable using
// the names that the MSVC compiler provides.  It should really only be used
// when trying to access the members of the vector as integer values; GCC and
// clang allow native access to the float members through a simple array
// access operator (in C since 4.6, in C++ since 4.8).
//
// Ideally, direct access to SIMD vectors should be avoided since it can cause
// a performance hit.  If it really is needed, however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components.  The use of this union should be hidden behind a
// macro that is used throughout the codebase to access the members instead of
// always declaring this type of variable.
328 typedef union ALIGN_STRUCT(16) SIMDVec {
329     float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
330     int8_t m128_i8[16];    // as signed 8-bit integers.
331     int16_t m128_i16[8];   // as signed 16-bit integers.
332     int32_t m128_i32[4];   // as signed 32-bit integers.
333     int64_t m128_i64[2];   // as signed 64-bit integers.
334     uint8_t m128_u8[16];   // as unsigned 8-bit integers.
335     uint16_t m128_u16[8];  // as unsigned 16-bit integers.
336     uint32_t m128_u32[4];  // as unsigned 32-bit integers.
337     uint64_t m128_u64[2];  // as unsigned 64-bit integers.
338 } SIMDVec;
339 
340 // casting using SIMDVec
341 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
342 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
343 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
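// A minimal usage sketch of the helpers above: reading the low 64-bit lane of
// an integer vector (illustrative values only):
//
//   __m128i v = _mm_set_epi64x(7, 42);  // low lane = 42, high lane = 7
//   uint64_t lo = vreinterpretq_nth_u64_m128i(v, 0);  // lo == 42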
344 
/* Backwards compatibility for compilers lacking specific type support */
346 
// Older GCC versions do not define the vld1q_u8_x4 intrinsic.
#if defined(__GNUC__) && !defined(__clang__) &&   \
    ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \
     (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) ||  \
     (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7)
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
353 {
354     uint8x16x4_t ret;
355     ret.val[0] = vld1q_u8(p + 0);
356     ret.val[1] = vld1q_u8(p + 16);
357     ret.val[2] = vld1q_u8(p + 32);
358     ret.val[3] = vld1q_u8(p + 48);
359     return ret;
360 }
361 #else
362 // Wraps vld1q_u8_x4
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
364 {
365     return vld1q_u8_x4(p);
366 }
367 #endif
368 
369 /* Function Naming Conventions
370  * The naming convention of SSE intrinsics is straightforward. A generic SSE
371  * intrinsic function is given as follows:
372  *   _mm_<name>_<data_type>
373  *
374  * The parts of this format are given as follows:
375  * 1. <name> describes the operation performed by the intrinsic
376  * 2. <data_type> identifies the data type of the function's primary arguments
377  *
378  * This last part, <data_type>, is a little complicated. It identifies the
379  * content of the input values, and can be set to any of the following values:
380  * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
382  * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
383  *                            signed integers
384  * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
385  *                            unsigned integers
386  * + si128 - unspecified 128-bit vector or 256-bit vector
387  * + m128/m128i/m128d - identifies input vector types when they are different
388  *                      than the type of the returned vector
389  *
390  * For example, _mm_setzero_ps. The _mm implies that the function returns
391  * a 128-bit vector. The _ps at the end implies that the argument vectors
392  * contain floats.
393  *
394  * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 *   // Set packed 16-bit integers. 128 bits, 8 shorts, 16 bits each
 *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 *   // Set packed 8-bit integers
 *   // 128 bits, 16 chars, 8 bits each
399  *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
400  *                                  4, 5, 12, 13, 6, 7, 14, 15);
401  *   // Shuffle packed 8-bit integers
402  *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
403  *
404  * Data (Number, Binary, Byte Index):
    +------+------+------+------+------+------+------+------+
406     |      1      |      2      |      3      |      4      | Number
407     +------+------+------+------+------+------+------+------+
408     | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
409     +------+------+------+------+------+------+------+------+
410     |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
411     +------+------+------+------+------+------+------+------+
412 
413     +------+------+------+------+------+------+------+------+
414     |      5      |      6      |      7      |      8      | Number
415     +------+------+------+------+------+------+------+------+
416     | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
417     +------+------+------+------+------+------+------+------+
418     |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
419     +------+------+------+------+------+------+------+------+
420  * Index (Byte Index):
421     +------+------+------+------+------+------+------+------+
422     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
423     +------+------+------+------+------+------+------+------+
424 
425     +------+------+------+------+------+------+------+------+
426     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
427     +------+------+------+------+------+------+------+------+
428  * Result:
429     +------+------+------+------+------+------+------+------+
430     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
431     +------+------+------+------+------+------+------+------+
432     | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
433     +------+------+------+------+------+------+------+------+
434     |     256     |      2      |      5      |      6      | Number
435     +------+------+------+------+------+------+------+------+
436 
437     +------+------+------+------+------+------+------+------+
438     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
439     +------+------+------+------+------+------+------+------+
440     | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
441     +------+------+------+------+------+------+------+------+
442     |      3      |      7      |      4      |      8      | Number
    +------+------+------+------+------+------+------+------+
444  */
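/* A short sketch of reading the shuffled result back on the C side
 * (illustrative only; reuses v_out from the example above):
 *
 *   int16_t out[8];
 *   _mm_storeu_si128((__m128i *) out, v_out);
 *   // out == { 256, 2, 5, 6, 3, 7, 4, 8 }
 */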
445 
446 /* Set/get methods */
447 
448 /* Constants for use with _mm_prefetch.  */
449 enum _mm_hint {
450     _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
451     _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
452     _MM_HINT_T1 = 2,   /* load data to L2 cache only */
453     _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
454     _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
455     _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
456     _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
457     _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
458 };
459 
460 // Loads one cache line of data from address p to a location closer to the
461 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
FORCE_INLINE void _mm_prefetch(const void *p, int i)
463 {
464     (void) i;
465     __builtin_prefetch(p);
466 }
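// A minimal usage sketch (hypothetical pointer name): prefetch the next block
// of a large buffer while the current block is being processed:
//
//   _mm_prefetch((const char *) ptr + 64, _MM_HINT_T0);
//
// On Arm the hint is ignored and a plain __builtin_prefetch is issued, as
// shown above.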
467 
// Pause the processor. This is typically used in spin-wait loops and,
// depending on the x86 processor, typical values are in the 40-100 cycle
// range. The 'yield' instruction isn't a good fit because it's effectively a
// nop on most Arm cores. Experience with several databases has shown that an
// 'isb' is a reasonable approximation.
FORCE_INLINE void _mm_pause()
474 {
475     __asm__ __volatile__("isb\n");
476 }
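// A typical spin-wait sketch (hypothetical flag variable), where _mm_pause
// keeps the loop from saturating the core while waiting:
//
//   while (atomic_load_explicit(&locked, memory_order_acquire))
//       _mm_pause();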
477 
478 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
479 //
480 //   dst[31:0] := a[31:0]
481 //
482 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
484 {
485     return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
486 }
487 
488 // Convert the lower single-precision (32-bit) floating-point element in b to a
489 // double-precision (64-bit) floating-point element, store the result in the
490 // lower element of dst, and copy the upper element from a to the upper element
491 // of dst.
492 //
493 //   dst[63:0] := Convert_FP32_To_FP64(b[31:0])
494 //   dst[127:64] := a[127:64]
495 //
496 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
498 {
499     double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
500 #if defined(__aarch64__)
501     return vreinterpretq_m128d_f64(
502         vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
503 #else
504     return vreinterpretq_m128d_s64(
505         vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
506 #endif
507 }
508 
509 // Convert the lower single-precision (32-bit) floating-point element in a to a
510 // 32-bit integer, and store the result in dst.
511 //
512 //   dst[31:0] := Convert_FP32_To_Int32(a[31:0])
513 //
514 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
515 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
516 
517 // Convert the lower single-precision (32-bit) floating-point element in a to a
518 // 64-bit integer, and store the result in dst.
519 //
520 //   dst[63:0] := Convert_FP32_To_Int64(a[31:0])
521 //
522 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
524 {
525 #if defined(__aarch64__)
526     return vgetq_lane_s64(
527         vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0);
528 #else
529     float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
530     float32_t diff = data - floor(data);
531     if (diff > 0.5)
532         return (int64_t) ceil(data);
533     if (unlikely(diff == 0.5)) {
534         int64_t f = (int64_t) floor(data);
535         int64_t c = (int64_t) ceil(data);
536         return c & 1 ? f : c;
537     }
538     return (int64_t) floor(data);
539 #endif
540 }
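// Both paths above round halfway cases to even, matching the default SSE
// rounding mode. For example (illustrative values):
//
//   _mm_cvtss_si64(_mm_set_ss(2.5f));  // -> 2
//   _mm_cvtss_si64(_mm_set_ss(3.5f));  // -> 4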
541 
542 // Convert packed single-precision (32-bit) floating-point elements in a to
543 // packed 32-bit integers with truncation, and store the results in dst.
544 //
545 //   FOR j := 0 to 1
546 //      i := 32*j
547 //      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
548 //   ENDFOR
549 //
550 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
552 {
553     return vreinterpret_m64_s32(
554         vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
555 }
556 
557 // Convert the lower single-precision (32-bit) floating-point element in a to a
558 // 32-bit integer with truncation, and store the result in dst.
559 //
560 //   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
561 //
562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
564 {
565     return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
566 }
567 
568 // Convert packed single-precision (32-bit) floating-point elements in a to
569 // packed 32-bit integers with truncation, and store the results in dst.
570 //
571 //   FOR j := 0 to 1
572 //      i := 32*j
573 //      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
574 //   ENDFOR
575 //
576 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
577 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
578 
579 // Convert the lower single-precision (32-bit) floating-point element in a to a
580 // 32-bit integer with truncation, and store the result in dst.
581 //
582 //   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
583 //
584 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
585 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
586 
587 // Convert the lower single-precision (32-bit) floating-point element in a to a
588 // 64-bit integer with truncation, and store the result in dst.
589 //
590 //   dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
591 //
592 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
594 {
595     return vgetq_lane_s64(
596         vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0);
597 }
598 
599 // Sets the 128-bit value to zero
600 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
FORCE_INLINE __m128i _mm_setzero_si128(void)
602 {
603     return vreinterpretq_m128i_s32(vdupq_n_s32(0));
604 }
605 
606 // Clears the four single-precision, floating-point values.
607 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setzero_ps(void)
609 {
610     return vreinterpretq_m128_f32(vdupq_n_f32(0));
611 }
612 
613 // Return vector of type __m128d with all elements set to zero.
614 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
FORCE_INLINE __m128d _mm_setzero_pd(void)
616 {
617 #if defined(__aarch64__)
618     return vreinterpretq_m128d_f64(vdupq_n_f64(0));
619 #else
620     return vreinterpretq_m128d_f32(vdupq_n_f32(0));
621 #endif
622 }
623 
624 // Sets the four single-precision, floating-point values to w.
625 //
626 //   r0 := r1 := r2 := r3 := w
627 //
628 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set1_ps(float _w)
630 {
631     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
632 }
633 
634 // Sets the four single-precision, floating-point values to w.
635 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
637 {
638     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
639 }
640 
641 // Sets the four single-precision, floating-point values to the four inputs.
642 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
644 {
645     float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
646     return vreinterpretq_m128_f32(vld1q_f32(data));
647 }
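// Note the argument order: _mm_set_ps(w, z, y, x) places x in the lowest
// lane, while _mm_setr_ps (below) stores its arguments in the order given.
// For example (illustrative values):
//
//   __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
//   float out[4];
//   _mm_storeu_ps(out, v);  // out == { 1.0f, 2.0f, 3.0f, 4.0f }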
648 
649 // Copy single-precision (32-bit) floating-point element a to the lower element
650 // of dst, and zero the upper 3 elements.
651 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
FORCE_INLINE __m128 _mm_set_ss(float a)
653 {
654     float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
655     return vreinterpretq_m128_f32(vld1q_f32(data));
656 }
657 
658 // Sets the four single-precision, floating-point values to the four inputs in
659 // reverse order.
660 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
662 {
663     float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
664     return vreinterpretq_m128_f32(vld1q_f32(data));
665 }
666 
667 // Sets the 8 signed 16-bit integer values in reverse order.
668 //
669 // Return Value
670 //   r0 := w0
671 //   r1 := w1
672 //   ...
673 //   r7 := w7
FORCE_INLINE __m128i _mm_setr_epi16(short w0,
675                                     short w1,
676                                     short w2,
677                                     short w3,
678                                     short w4,
679                                     short w5,
680                                     short w6,
681                                     short w7)
682 {
683     int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
684     return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
685 }
686 
687 // Sets the 4 signed 32-bit integer values in reverse order
688 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
690 {
691     int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
692     return vreinterpretq_m128i_s32(vld1q_s32(data));
693 }
694 
695 // Set packed 64-bit integers in dst with the supplied values in reverse order.
696 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
698 {
699     return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
700 }
701 
702 // Sets the 16 signed 8-bit integer values to b.
703 //
704 //   r0 := b
705 //   r1 := b
706 //   ...
707 //   r15 := b
708 //
709 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
711 {
712     return vreinterpretq_m128i_s8(vdupq_n_s8(w));
713 }
714 
715 // Broadcast double-precision (64-bit) floating-point value a to all elements of
716 // dst.
717 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
FORCE_INLINE __m128d _mm_set1_pd(double d)
719 {
720 #if defined(__aarch64__)
721     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
722 #else
723     return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
724 #endif
725 }
726 
727 // Sets the 8 signed 16-bit integer values to w.
728 //
729 //   r0 := w
730 //   r1 := w
731 //   ...
732 //   r7 := w
733 //
734 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set1_epi16(short w)
736 {
737     return vreinterpretq_m128i_s16(vdupq_n_s16(w));
738 }
739 
740 // Sets the 16 signed 8-bit integer values.
741 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
743                                   signed char b14,
744                                   signed char b13,
745                                   signed char b12,
746                                   signed char b11,
747                                   signed char b10,
748                                   signed char b9,
749                                   signed char b8,
750                                   signed char b7,
751                                   signed char b6,
752                                   signed char b5,
753                                   signed char b4,
754                                   signed char b3,
755                                   signed char b2,
756                                   signed char b1,
757                                   signed char b0)
758 {
759     int8_t ALIGN_STRUCT(16)
760         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
761                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
762                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
763                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
764     return (__m128i) vld1q_s8(data);
765 }
766 
767 // Sets the 8 signed 16-bit integer values.
768 // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi16(short i7,
770                                    short i6,
771                                    short i5,
772                                    short i4,
773                                    short i3,
774                                    short i2,
775                                    short i1,
776                                    short i0)
777 {
778     int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
779     return vreinterpretq_m128i_s16(vld1q_s16(data));
780 }
781 
782 // Sets the 16 signed 8-bit integer values in reverse order.
783 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
785                                    signed char b1,
786                                    signed char b2,
787                                    signed char b3,
788                                    signed char b4,
789                                    signed char b5,
790                                    signed char b6,
791                                    signed char b7,
792                                    signed char b8,
793                                    signed char b9,
794                                    signed char b10,
795                                    signed char b11,
796                                    signed char b12,
797                                    signed char b13,
798                                    signed char b14,
799                                    signed char b15)
800 {
801     int8_t ALIGN_STRUCT(16)
802         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
803                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
804                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
805                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
806     return (__m128i) vld1q_s8(data);
807 }
808 
// Sets the 4 signed 32-bit integer values to i.
//
//   r0 := i
//   r1 := i
//   r2 := i
//   r3 := i
//
// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi32(int _i)
818 {
819     return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
820 }
821 
822 // Sets the 2 signed 64-bit integer values to i.
823 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
825 {
826     return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
827 }
828 
829 // Sets the 2 signed 64-bit integer values to i.
830 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
832 {
833     return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
834 }
835 
836 // Sets the 4 signed 32-bit integer values.
837 // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
839 {
840     int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
841     return vreinterpretq_m128i_s32(vld1q_s32(data));
842 }
843 
844 // Returns the __m128i structure with its two 64-bit integer values
845 // initialized to the values of the two 64-bit integers passed in.
846 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
848 {
849     return vreinterpretq_m128i_s64(
850         vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
851 }
852 
853 // Returns the __m128i structure with its two 64-bit integer values
854 // initialized to the values of the two 64-bit integers passed in.
855 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
857 {
858     return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
859 }
860 
861 // Set packed double-precision (64-bit) floating-point elements in dst with the
862 // supplied values.
863 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
865 {
866     double ALIGN_STRUCT(16) data[2] = {e0, e1};
867 #if defined(__aarch64__)
868     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
869 #else
870     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
871 #endif
872 }
873 
874 // Set packed double-precision (64-bit) floating-point elements in dst with the
875 // supplied values in reverse order.
876 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
878 {
879     return _mm_set_pd(e0, e1);
880 }
881 
882 // Copy double-precision (64-bit) floating-point element a to the lower element
883 // of dst, and zero the upper element.
884 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
FORCE_INLINE __m128d _mm_set_sd(double a)
886 {
887     return _mm_set_pd(0, a);
888 }
889 
890 // Broadcast double-precision (64-bit) floating-point value a to all elements of
891 // dst.
892 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
893 #define _mm_set_pd1 _mm_set1_pd
894 
895 // Stores four single-precision, floating-point values.
896 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
898 {
899     vst1q_f32(p, vreinterpretq_f32_m128(a));
900 }
901 
902 // Store the lower single-precision (32-bit) floating-point element from a into
903 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
904 // boundary or a general-protection exception may be generated.
905 //
906 //   MEM[mem_addr+31:mem_addr] := a[31:0]
907 //   MEM[mem_addr+63:mem_addr+32] := a[31:0]
908 //   MEM[mem_addr+95:mem_addr+64] := a[31:0]
909 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
910 //
911 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
913 {
914     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
915     vst1q_f32(p, vdupq_n_f32(a0));
916 }
917 
918 // Store the lower single-precision (32-bit) floating-point element from a into
919 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
920 // boundary or a general-protection exception may be generated.
921 //
922 //   MEM[mem_addr+31:mem_addr] := a[31:0]
923 //   MEM[mem_addr+63:mem_addr+32] := a[31:0]
924 //   MEM[mem_addr+95:mem_addr+64] := a[31:0]
925 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
926 //
927 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
928 #define _mm_store1_ps _mm_store_ps1
929 
930 // Store 4 single-precision (32-bit) floating-point elements from a into memory
931 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
932 // general-protection exception may be generated.
933 //
934 //   MEM[mem_addr+31:mem_addr] := a[127:96]
935 //   MEM[mem_addr+63:mem_addr+32] := a[95:64]
936 //   MEM[mem_addr+95:mem_addr+64] := a[63:32]
937 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
938 //
939 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
941 {
942     float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
943     float32x4_t rev = vextq_f32(tmp, tmp, 2);
944     vst1q_f32(p, rev);
945 }
946 
947 // Stores four single-precision, floating-point values.
948 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
950 {
951     vst1q_f32(p, vreinterpretq_f32_m128(a));
952 }
953 
// Stores four 32-bit integer values (as a __m128i value) at the address p.
// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
957 {
958     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
959 }
960 
// Stores four 32-bit integer values (as a __m128i value) at the address p.
// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
964 {
965     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
966 }
967 
// Stores the lower single-precision, floating-point value.
// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
971 {
972     vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
973 }
974 
975 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
976 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
977 // or a general-protection exception may be generated.
978 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
980 {
981 #if defined(__aarch64__)
982     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
983 #else
984     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
985 #endif
986 }
987 
988 // Store the upper double-precision (64-bit) floating-point element from a into
989 // memory.
990 //
991 //   MEM[mem_addr+63:mem_addr] := a[127:64]
992 //
993 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
995 {
996 #if defined(__aarch64__)
997     vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
998 #else
999     vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
1000 #endif
1001 }
1002 
1003 // Store the lower double-precision (64-bit) floating-point element from a into
1004 // memory.
1005 //
1006 //   MEM[mem_addr+63:mem_addr] := a[63:0]
1007 //
1008 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
1010 {
1011 #if defined(__aarch64__)
1012     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
1013 #else
1014     vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
1015 #endif
1016 }
1017 
1018 // Store 2 double-precision (64-bit) floating-point elements from a into memory
1019 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1020 // general-protection exception may be generated.
1021 //
1022 //   MEM[mem_addr+63:mem_addr] := a[127:64]
1023 //   MEM[mem_addr+127:mem_addr+64] := a[63:0]
1024 //
1025 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
1027 {
1028     float32x4_t f = vreinterpretq_f32_m128d(a);
1029     _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
1030 }
1031 
1032 // Store the lower double-precision (64-bit) floating-point element from a into
1033 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
1034 // boundary or a general-protection exception may be generated.
1035 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
1037 {
1038 #if defined(__aarch64__)
1039     float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
1040     vst1q_f64((float64_t *) mem_addr,
1041               vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
1042 #else
1043     float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
1044     vst1q_f32((float32_t *) mem_addr,
1045               vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
1046 #endif
1047 }
1048 
1049 // Store the lower double-precision (64-bit) floating-point element from a into
1050 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
1051 // boundary or a general-protection exception may be generated.
1052 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
1053 #define _mm_store1_pd _mm_store_pd1
1054 
1055 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
1056 // elements) from a into memory. mem_addr does not need to be aligned on any
1057 // particular boundary.
1058 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
1060 {
1061     _mm_store_pd(mem_addr, a);
1062 }
1063 
1064 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
1065 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
1067 {
1068     uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
1069     uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
1070     *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
1071 }
1072 
1073 // Stores the lower two single-precision floating point values of a to the
1074 // address p.
1075 //
1076 //   *p0 := a0
1077 //   *p1 := a1
1078 //
1079 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
1081 {
1082     *p = vreinterpret_m64_f32(vget_low_f32(a));
1083 }
1084 
1085 // Stores the upper two single-precision, floating-point values of a to the
1086 // address p.
1087 //
1088 //   *p0 := a2
1089 //   *p1 := a3
1090 //
1091 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
1093 {
1094     *p = vreinterpret_m64_f32(vget_high_f32(a));
1095 }
1096 
// Loads a single-precision, floating-point value and copies it into all four
// words.
// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1101 {
1102     return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1103 }
1104 
1105 // Load a single-precision (32-bit) floating-point element from memory into all
1106 // elements of dst.
1107 //
1108 //   dst[31:0] := MEM[mem_addr+31:mem_addr]
1109 //   dst[63:32] := MEM[mem_addr+31:mem_addr]
1110 //   dst[95:64] := MEM[mem_addr+31:mem_addr]
1111 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
1112 //
1113 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
1114 #define _mm_load_ps1 _mm_load1_ps
1115 
1116 // Sets the lower two single-precision, floating-point values with 64
1117 // bits of data loaded from the address p; the upper two values are passed
1118 // through from a.
1119 //
1120 // Return Value
1121 //   r0 := *p0
1122 //   r1 := *p1
1123 //   r2 := a2
1124 //   r3 := a3
1125 //
1126 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1128 {
1129     return vreinterpretq_m128_f32(
1130         vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1131 }
1132 
1133 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
1134 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1135 // general-protection exception may be generated.
1136 //
1137 //   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
1138 //   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
1139 //   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
1140 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
1141 //
1142 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1144 {
1145     float32x4_t v = vrev64q_f32(vld1q_f32(p));
1146     return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1147 }
1148 
1149 // Sets the upper two single-precision, floating-point values with 64
1150 // bits of data loaded from the address p; the lower two values are passed
1151 // through from a.
1152 //
1153 //   r0 := a0
1154 //   r1 := a1
1155 //   r2 := *p0
1156 //   r3 := *p1
1157 //
1158 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1160 {
1161     return vreinterpretq_m128_f32(
1162         vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1163 }
1164 
1165 // Loads four single-precision, floating-point values.
1166 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
1168 {
1169     return vreinterpretq_m128_f32(vld1q_f32(p));
1170 }
1171 
1172 // Loads four single-precision, floating-point values.
1173 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1175 {
    // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
    // are equivalent.
1178     return vreinterpretq_m128_f32(vld1q_f32(p));
1179 }
1180 
1181 // Load unaligned 16-bit integer from memory into the first element of dst.
1182 //
1183 //   dst[15:0] := MEM[mem_addr+15:mem_addr]
1184 //   dst[MAX:16] := 0
1185 //
1186 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1188 {
1189     return vreinterpretq_m128i_s16(
1190         vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1191 }
1192 
1193 // Load unaligned 64-bit integer from memory into the first element of dst.
1194 //
1195 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
1196 //   dst[MAX:64] := 0
1197 //
1198 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1200 {
1201     return vreinterpretq_m128i_s64(
1202         vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1203 }
1204 
1205 // Load a double-precision (64-bit) floating-point element from memory into the
1206 // lower of dst, and zero the upper element. mem_addr does not need to be
1207 // aligned on any particular boundary.
1208 //
1209 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
1210 //   dst[127:64] := 0
1211 //
1212 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
FORCE_INLINE __m128d _mm_load_sd(const double *p)
1214 {
1215 #if defined(__aarch64__)
1216     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
1217 #else
1218     const float *fp = (const float *) p;
1219     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
1220     return vreinterpretq_m128d_f32(vld1q_f32(data));
1221 #endif
1222 }
1223 
// Loads two double-precision, floating-point values from 16-byte aligned
// memory.
1226 //
1227 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
1228 //
1229 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
FORCE_INLINE __m128d _mm_load_pd(const double *p)
1231 {
1232 #if defined(__aarch64__)
1233     return vreinterpretq_m128d_f64(vld1q_f64(p));
1234 #else
1235     const float *fp = (const float *) p;
1236     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
1237     return vreinterpretq_m128d_f32(vld1q_f32(data));
1238 #endif
1239 }
1240 
// Loads two double-precision, floating-point values from unaligned memory.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
1244 {
1245     return _mm_load_pd(p);
1246 }
1247 
// Loads a single-precision, floating-point value into the low word and
// clears the upper three words.
// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
1252 {
1253     return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1254 }
1255 
1256 // Load 64-bit integer from memory into the first element of dst.
1257 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
1259 {
1260     /* Load the lower 64 bits of the value pointed to by p into the
1261      * lower 64 bits of the result, zeroing the upper 64 bits of the result.
1262      */
1263     return vreinterpretq_m128i_s32(
1264         vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
1265 }
1266 
1267 // Load a double-precision (64-bit) floating-point element from memory into the
1268 // lower element of dst, and copy the upper element from a to dst. mem_addr does
1269 // not need to be aligned on any particular boundary.
1270 //
1271 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
1272 //   dst[127:64] := a[127:64]
1273 //
1274 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
1276 {
1277 #if defined(__aarch64__)
1278     return vreinterpretq_m128d_f64(
1279         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
1280 #else
1281     return vreinterpretq_m128d_f32(
1282         vcombine_f32(vld1_f32((const float *) p),
1283                      vget_high_f32(vreinterpretq_f32_m128d(a))));
1284 #endif
1285 }
1286 
1287 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
1288 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1289 // general-protection exception may be generated.
1290 //
1291 //   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
1292 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
1293 //
1294 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
1296 {
1297 #if defined(__aarch64__)
1298     float64x2_t v = vld1q_f64(p);
1299     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
1300 #else
1301     int64x2_t v = vld1q_s64((const int64_t *) p);
1302     return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
1303 #endif
1304 }
1305 
// Sets the low word to the single-precision, floating-point value of b, and
// copies the upper three words from a.
1307 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
1309 {
1310     return vreinterpretq_m128_f32(
1311         vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
1312                        vreinterpretq_f32_m128(a), 0));
1313 }
1314 
1315 // Move the lower double-precision (64-bit) floating-point element from b to the
1316 // lower element of dst, and copy the upper element from a to the upper element
1317 // of dst.
1318 //
1319 //   dst[63:0] := b[63:0]
1320 //   dst[127:64] := a[127:64]
1321 //
1322 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
1324 {
1325     return vreinterpretq_m128d_f32(
1326         vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
1327                      vget_high_f32(vreinterpretq_f32_m128d(a))));
1328 }
1329 
1330 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
1331 // upper element.
1332 //
1333 //   dst[63:0] := a[63:0]
1334 //   dst[127:64] := 0
1335 //
1336 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
1338 {
1339     return vreinterpretq_m128i_s64(
1340         vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
1341 }
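
// Minimal usage sketch (hypothetical helper, not part of the API): clearing
// the upper 64 bits of an integer vector with _mm_move_epi64.
FORCE_INLINE __m128i sse2neon_example_clear_high64(__m128i a)
{
    // Keeps a[63:0] and zeroes a[127:64].
    return _mm_move_epi64(a);
}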
1342 
1343 // Return vector of type __m128 with undefined elements.
1344 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
1346 {
1347 #if defined(__GNUC__) || defined(__clang__)
1348 #pragma GCC diagnostic push
1349 #pragma GCC diagnostic ignored "-Wuninitialized"
1350 #endif
1351     __m128 a;
1352     return a;
1353 #if defined(__GNUC__) || defined(__clang__)
1354 #pragma GCC diagnostic pop
1355 #endif
1356 }
1357 
1358 /* Logic/Binary operations */
1359 
1360 // Computes the bitwise AND-NOT of the four single-precision, floating-point
1361 // values of a and b.
1362 //
1363 //   r0 := ~a0 & b0
1364 //   r1 := ~a1 & b1
1365 //   r2 := ~a2 & b2
1366 //   r3 := ~a3 & b3
1367 //
1368 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1370 {
1371     return vreinterpretq_m128_s32(
1372         vbicq_s32(vreinterpretq_s32_m128(b),
1373                   vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
1374 }
1375 
1376 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
1377 // elements in a and then AND with b, and store the results in dst.
1378 //
1379 //   FOR j := 0 to 1
1380 // 	     i := j*64
1381 // 	     dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
1382 //   ENDFOR
1383 //
1384 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
1386 {
1387     // *NOTE* argument swap
1388     return vreinterpretq_m128d_s64(
1389         vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
1390 }
1391 
1392 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
1393 // 128-bit value in a.
1394 //
1395 //   r := (~a) & b
1396 //
1397 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
1399 {
1400     return vreinterpretq_m128i_s32(
1401         vbicq_s32(vreinterpretq_s32_m128i(b),
1402                   vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
1403 }
1404 
1405 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
1406 // b.
1407 //
1408 //   r := a & b
1409 //
1410 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
1412 {
1413     return vreinterpretq_m128i_s32(
1414         vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1415 }
1416 
1417 // Computes the bitwise AND of the four single-precision, floating-point values
1418 // of a and b.
1419 //
1420 //   r0 := a0 & b0
1421 //   r1 := a1 & b1
1422 //   r2 := a2 & b2
1423 //   r3 := a3 & b3
1424 //
1425 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1427 {
1428     return vreinterpretq_m128_s32(
1429         vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1430 }
1431 
1432 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
1433 // elements in a and b, and store the results in dst.
1434 //
1435 //   FOR j := 0 to 1
1436 //     i := j*64
1437 //     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
1438 //   ENDFOR
1439 //
1440 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
1442 {
1443     return vreinterpretq_m128d_s64(
1444         vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1445 }
1446 
1447 // Computes the bitwise OR of the four single-precision, floating-point values
1448 // of a and b.
1449 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
1451 {
1452     return vreinterpretq_m128_s32(
1453         vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1454 }
1455 
// Computes the bitwise XOR (exclusive OR) of the four single-precision,
// floating-point values of a and b.
1458 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
1460 {
1461     return vreinterpretq_m128_s32(
1462         veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1463 }
1464 
1465 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
1466 // elements in a and b, and store the results in dst.
1467 //
1468 //   FOR j := 0 to 1
1469 //      i := j*64
1470 //      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
1471 //   ENDFOR
1472 //
1473 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
1475 {
1476     return vreinterpretq_m128d_s64(
1477         veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1478 }
1479 
1480 // Compute the bitwise OR of packed double-precision (64-bit) floating-point
1481 // elements in a and b, and store the results in dst.
1482 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
1484 {
1485     return vreinterpretq_m128d_s64(
1486         vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1487 }
1488 
1489 // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
1490 //
1491 //   r := a | b
1492 //
1493 // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
1495 {
1496     return vreinterpretq_m128i_s32(
1497         vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1498 }
1499 
1500 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
1501 // b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
1503 {
1504     return vreinterpretq_m128i_s32(
1505         veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1506 }
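
// Illustrative sketch (hypothetical helper): the classic branchless select
// "(mask & b) | (~mask & a)" built from the bitwise intrinsics above.  Lanes
// whose mask bits are all ones take b; lanes whose mask bits are zero take a.
FORCE_INLINE __m128i sse2neon_example_select_si128(__m128i mask,
                                                   __m128i a,
                                                   __m128i b)
{
    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}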
1507 
1508 // Duplicate the low double-precision (64-bit) floating-point element from a,
1509 // and store the results in dst.
1510 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
1512 {
#if defined(__aarch64__)
1514     return vreinterpretq_m128d_f64(
1515         vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
1516 #else
1517     return vreinterpretq_m128d_u64(
1518         vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
1519 #endif
1520 }
1521 
1522 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
1523 // from a, and store the results in dst.
1524 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
1526 {
1527 #if __has_builtin(__builtin_shufflevector)
1528     return vreinterpretq_m128_f32(__builtin_shufflevector(
1529         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
1530 #else
1531     float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
1532     float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
1533     float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
1534     return vreinterpretq_m128_f32(vld1q_f32(data));
1535 #endif
1536 }
1537 
1538 // Duplicate even-indexed single-precision (32-bit) floating-point elements
1539 // from a, and store the results in dst.
1540 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
1542 {
1543 #if __has_builtin(__builtin_shufflevector)
1544     return vreinterpretq_m128_f32(__builtin_shufflevector(
1545         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
1546 #else
1547     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1548     float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
1549     float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
1550     return vreinterpretq_m128_f32(vld1q_f32(data));
1551 #endif
1552 }
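
// Minimal sketch (hypothetical helper): splitting a vector of interleaved
// (real, imag) pairs into an all-real and an all-imaginary vector with the
// duplication intrinsics above.
FORCE_INLINE void sse2neon_example_split_complex(__m128 ri,
                                                 __m128 *re,
                                                 __m128 *im)
{
    *re = _mm_moveldup_ps(ri);  // { r0, r0, r1, r1 }
    *im = _mm_movehdup_ps(ri);  // { i0, i0, i1, i1 }
}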
1553 
1554 // Moves the upper two values of B into the lower two values of A.
1555 //
1556 //   r3 := a3
1557 //   r2 := a2
1558 //   r1 := b3
1559 //   r0 := b2
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1561 {
1562     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
1563     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
1564     return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
1565 }
1566 
1567 // Moves the lower two values of B into the upper two values of A.
1568 //
1569 //   r3 := b1
1570 //   r2 := b0
1571 //   r1 := a1
1572 //   r0 := a0
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
1574 {
1575     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
1576     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
1577     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1578 }
1579 
1580 // Compute the absolute value of packed signed 32-bit integers in a, and store
1581 // the unsigned results in dst.
1582 //
1583 //   FOR j := 0 to 3
1584 //     i := j*32
1585 //     dst[i+31:i] := ABS(a[i+31:i])
1586 //   ENDFOR
1587 //
1588 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
1590 {
1591     return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
1592 }
1593 
1594 // Compute the absolute value of packed signed 16-bit integers in a, and store
1595 // the unsigned results in dst.
1596 //
1597 //   FOR j := 0 to 7
1598 //     i := j*16
1599 //     dst[i+15:i] := ABS(a[i+15:i])
1600 //   ENDFOR
1601 //
1602 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
1604 {
1605     return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
1606 }
1607 
1608 // Compute the absolute value of packed signed 8-bit integers in a, and store
1609 // the unsigned results in dst.
1610 //
1611 //   FOR j := 0 to 15
1612 //     i := j*8
1613 //     dst[i+7:i] := ABS(a[i+7:i])
1614 //   ENDFOR
1615 //
1616 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
1618 {
1619     return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
1620 }
1621 
1622 // Compute the absolute value of packed signed 32-bit integers in a, and store
1623 // the unsigned results in dst.
1624 //
1625 //   FOR j := 0 to 1
1626 //     i := j*32
1627 //     dst[i+31:i] := ABS(a[i+31:i])
1628 //   ENDFOR
1629 //
1630 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
1632 {
1633     return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
1634 }
1635 
1636 // Compute the absolute value of packed signed 16-bit integers in a, and store
1637 // the unsigned results in dst.
1638 //
1639 //   FOR j := 0 to 3
1640 //     i := j*16
1641 //     dst[i+15:i] := ABS(a[i+15:i])
1642 //   ENDFOR
1643 //
1644 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
1646 {
1647     return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
1648 }
1649 
1650 // Compute the absolute value of packed signed 8-bit integers in a, and store
1651 // the unsigned results in dst.
1652 //
1653 //   FOR j := 0 to 7
1654 //     i := j*8
1655 //     dst[i+7:i] := ABS(a[i+7:i])
1656 //   ENDFOR
1657 //
1658 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
1660 {
1661     return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
1662 }
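
// Illustrative sketch (hypothetical helper): lane-wise absolute value of
// 16-bit integers.  As with the x86 instruction, ABS(INT16_MIN) wraps back to
// INT16_MIN because +32768 is not representable in 16 bits.
FORCE_INLINE __m128i sse2neon_example_abs16(__m128i a)
{
    return _mm_abs_epi16(a);
}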
1663 
1664 // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
1665 // the result right by imm8 bytes, and store the low 16 bytes in dst.
1666 //
1667 //   tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
1668 //   dst[127:0] := tmp[127:0]
1669 //
1670 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
1671 #define _mm_alignr_epi8(a, b, imm)                                            \
1672     __extension__({                                                           \
1673         __m128i ret;                                                          \
1674         if (unlikely((imm) >= 32)) {                                          \
1675             ret = _mm_setzero_si128();                                        \
1676         } else {                                                              \
1677             uint8x16_t tmp_low, tmp_high;                                     \
1678             if (imm >= 16) {                                                  \
1679                 const int idx = imm - 16;                                     \
1680                 tmp_low = vreinterpretq_u8_m128i(a);                          \
1681                 tmp_high = vdupq_n_u8(0);                                     \
1682                 ret =                                                         \
1683                     vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
1684             } else {                                                          \
1685                 const int idx = imm;                                          \
1686                 tmp_low = vreinterpretq_u8_m128i(b);                          \
1687                 tmp_high = vreinterpretq_u8_m128i(a);                         \
1688                 ret =                                                         \
1689                     vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
1690             }                                                                 \
1691         }                                                                     \
1692         ret;                                                                  \
1693     })
1694 
1695 // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
1696 // the result right by imm8 bytes, and store the low 8 bytes in dst.
1697 //
1698 //   tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
1699 //   dst[63:0] := tmp[63:0]
1700 //
1701 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
1702 #define _mm_alignr_pi8(a, b, imm)                                           \
1703     __extension__({                                                         \
1704         __m64 ret;                                                          \
1705         if (unlikely((imm) >= 16)) {                                        \
1706             ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
1707         } else {                                                            \
1708             uint8x8_t tmp_low, tmp_high;                                    \
1709             if (imm >= 8) {                                                 \
1710                 const int idx = imm - 8;                                    \
1711                 tmp_low = vreinterpret_u8_m64(a);                           \
1712                 tmp_high = vdup_n_u8(0);                                    \
1713                 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
1714             } else {                                                        \
1715                 const int idx = imm;                                        \
1716                 tmp_low = vreinterpret_u8_m64(b);                           \
1717                 tmp_high = vreinterpret_u8_m64(a);                          \
1718                 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
1719             }                                                               \
1720         }                                                                   \
1721         ret;                                                                \
1722     })
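
// Minimal sketch (hypothetical helper): _mm_alignr_epi8 with a byte offset of
// 4 pulls a 16-byte window spanning two consecutive blocks: bytes 4..15 of lo
// followed by bytes 0..3 of hi.
FORCE_INLINE __m128i sse2neon_example_window4(__m128i hi, __m128i lo)
{
    return _mm_alignr_epi8(hi, lo, 4);
}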
1723 
1724 // Takes the upper 64 bits of a and places it in the low end of the result
1725 // Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
1727 {
1728     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1729     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1730     return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
1731 }
1732 
// takes the lower two 32-bit values from a, swaps them, and places them in
// the low end of the result; takes the higher two 32-bit values from b,
// swaps them, and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
1737 {
1738     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1739     float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
1740     return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
1741 }
1742 
FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
1744 {
1745     float32x2_t a21 = vget_high_f32(
1746         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1747     float32x2_t b03 = vget_low_f32(
1748         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1749     return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
1750 }
1751 
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
1753 {
1754     float32x2_t a03 = vget_low_f32(
1755         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1756     float32x2_t b21 = vget_high_f32(
1757         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1758     return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
1759 }
1760 
FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
1762 {
1763     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1764     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1765     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1766 }
1767 
FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
1769 {
1770     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1771     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1772     return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
1773 }
1774 
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
1776 {
1777     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1778     float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
1779     return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
1780 }
1781 
// keeps the low 64 bits of a in the low half and puts the high 64 bits of b
// in the high half
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
1785 {
1786     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1787     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1788     return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
1789 }
1790 
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
1792 {
1793     float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
1794     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1795     return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
1796 }
1797 
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
1799 {
1800     float32x2_t a22 =
1801         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1802     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1803     return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
1804 }
1805 
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
1807 {
1808     float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
1809     float32x2_t b22 =
1810         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
1811     return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
1812 }
1813 
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
1815 {
1816     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1817     float32x2_t a22 =
1818         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1819     float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
1820     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1821     return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
1822 }
1823 
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
1825 {
1826     float32x2_t a33 =
1827         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
1828     float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
1829     return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
1830 }
1831 
FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
1833 {
1834     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1835     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1836     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1837     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1838     return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
1839 }
1840 
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
1842 {
1843     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1845     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1846     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1847     return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
1848 }
1849 
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
1851 {
1852     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1854     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1855     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1856     return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
1857 }
1858 
1859 // NEON does not support a general purpose permute intrinsic
1860 // Selects four specific single-precision, floating-point values from a and b,
1861 // based on the mask i.
1862 //
1863 // C equivalent:
1864 //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1865 //                                 __constrange(0, 255) int imm) {
1866 //       __m128 ret;
1867 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
1868 //       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
1869 //       return ret;
1870 //   }
1871 //
1872 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
1873 #define _mm_shuffle_ps_default(a, b, imm)                                  \
1874     __extension__({                                                        \
1875         float32x4_t ret;                                                   \
1876         ret = vmovq_n_f32(                                                 \
1877             vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
1878         ret = vsetq_lane_f32(                                              \
1879             vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1880             ret, 1);                                                       \
1881         ret = vsetq_lane_f32(                                              \
1882             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1883             ret, 2);                                                       \
1884         ret = vsetq_lane_f32(                                              \
1885             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1886             ret, 3);                                                       \
1887         vreinterpretq_m128_f32(ret);                                       \
1888     })
1889 
1890 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
1891 // int imm)
1892 #if __has_builtin(__builtin_shufflevector)
1893 #define _mm_shuffle_ps(a, b, imm)                                \
1894     __extension__({                                              \
1895         float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
1896         float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
1897         float32x4_t _shuf = __builtin_shufflevector(             \
1898             _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1899             (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
1900         vreinterpretq_m128_f32(_shuf);                           \
1901     })
1902 #else  // generic
1903 #define _mm_shuffle_ps(a, b, imm)                          \
1904     __extension__({                                        \
1905         __m128 ret;                                        \
1906         switch (imm) {                                     \
1907         case _MM_SHUFFLE(1, 0, 3, 2):                      \
1908             ret = _mm_shuffle_ps_1032((a), (b));           \
1909             break;                                         \
1910         case _MM_SHUFFLE(2, 3, 0, 1):                      \
1911             ret = _mm_shuffle_ps_2301((a), (b));           \
1912             break;                                         \
1913         case _MM_SHUFFLE(0, 3, 2, 1):                      \
1914             ret = _mm_shuffle_ps_0321((a), (b));           \
1915             break;                                         \
1916         case _MM_SHUFFLE(2, 1, 0, 3):                      \
1917             ret = _mm_shuffle_ps_2103((a), (b));           \
1918             break;                                         \
1919         case _MM_SHUFFLE(1, 0, 1, 0):                      \
1920             ret = _mm_movelh_ps((a), (b));                 \
1921             break;                                         \
1922         case _MM_SHUFFLE(1, 0, 0, 1):                      \
1923             ret = _mm_shuffle_ps_1001((a), (b));           \
1924             break;                                         \
1925         case _MM_SHUFFLE(0, 1, 0, 1):                      \
1926             ret = _mm_shuffle_ps_0101((a), (b));           \
1927             break;                                         \
1928         case _MM_SHUFFLE(3, 2, 1, 0):                      \
1929             ret = _mm_shuffle_ps_3210((a), (b));           \
1930             break;                                         \
1931         case _MM_SHUFFLE(0, 0, 1, 1):                      \
1932             ret = _mm_shuffle_ps_0011((a), (b));           \
1933             break;                                         \
1934         case _MM_SHUFFLE(0, 0, 2, 2):                      \
1935             ret = _mm_shuffle_ps_0022((a), (b));           \
1936             break;                                         \
1937         case _MM_SHUFFLE(2, 2, 0, 0):                      \
1938             ret = _mm_shuffle_ps_2200((a), (b));           \
1939             break;                                         \
1940         case _MM_SHUFFLE(3, 2, 0, 2):                      \
1941             ret = _mm_shuffle_ps_3202((a), (b));           \
1942             break;                                         \
1943         case _MM_SHUFFLE(3, 2, 3, 2):                      \
1944             ret = _mm_movehl_ps((b), (a));                 \
1945             break;                                         \
1946         case _MM_SHUFFLE(1, 1, 3, 3):                      \
1947             ret = _mm_shuffle_ps_1133((a), (b));           \
1948             break;                                         \
1949         case _MM_SHUFFLE(2, 0, 1, 0):                      \
1950             ret = _mm_shuffle_ps_2010((a), (b));           \
1951             break;                                         \
1952         case _MM_SHUFFLE(2, 0, 0, 1):                      \
1953             ret = _mm_shuffle_ps_2001((a), (b));           \
1954             break;                                         \
1955         case _MM_SHUFFLE(2, 0, 3, 2):                      \
1956             ret = _mm_shuffle_ps_2032((a), (b));           \
1957             break;                                         \
1958         default:                                           \
1959             ret = _mm_shuffle_ps_default((a), (b), (imm)); \
1960             break;                                         \
1961         }                                                  \
1962         ret;                                               \
1963     })
1964 #endif
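
// Illustrative sketch (hypothetical helper): the imm8 argument is normally
// written with _MM_SHUFFLE(w3, w2, w1, w0); broadcasting lane 2 of a vector
// looks like this.
FORCE_INLINE __m128 sse2neon_example_splat_lane2(__m128 a)
{
    return _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2));
}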
1965 
1966 // Takes the upper 64 bits of a and places it in the low end of the result
1967 // Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
1969 {
1970     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1971     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1972     return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1973 }
1974 
// takes the lower two 32-bit values from a, swaps them, and places them in
// the low end of the result; takes the higher two 32-bit values from a,
// swaps them, and places them in the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
1979 {
1980     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1981     int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1982     return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1983 }
1984 
// rotates the least significant 32 bits into the most significant 32 bits, and
1986 // shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
1988 {
1989     return vreinterpretq_m128i_s32(
1990         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
1991 }
1992 
// rotates the most significant 32 bits into the least significant 32 bits, and
1994 // shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
1996 {
1997     return vreinterpretq_m128i_s32(
1998         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
1999 }
2000 
2001 // gets the lower 64 bits of a, and places it in the upper 64 bits
2002 // gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
2004 {
2005     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
2006     return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
2007 }
2008 
// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
// the lower 64 bits; gets the lower 64 bits of a and places them in the upper
// 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
2012 {
2013     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
2014     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
2015     return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
2016 }
2017 
// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
// the upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements,
// and places them in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
2022 {
2023     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
2024     return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
2025 }
2026 
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
2028 {
2029     int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
2030     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
2031     return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
2032 }
2033 
FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
2035 {
2036     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
2037     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
2038     return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
2039 }
2040 
FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
2042 {
2043     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
2044     int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
2045     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
2046 }
2047 
2048 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
2049 // corresponding 8-bit element of b, and store the results in dst.
2050 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
2052 {
2053     int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
2054     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
2055     uint8x16_t idx_masked =
2056         vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
2057 #if defined(__aarch64__)
2058     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
2059 #elif defined(__GNUC__)
2060     int8x16_t ret;
2061     // %e and %f represent the even and odd D registers
2062     // respectively.
2063     __asm__ __volatile__(
2064         "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
2065         "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
2066         : [ret] "=&w"(ret)
2067         : [tbl] "w"(tbl), [idx] "w"(idx_masked));
2068     return vreinterpretq_m128i_s8(ret);
2069 #else
    // Generic fallback: split the table in two and use a pair of vtbl2
    // lookups.
2071     int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
2072     return vreinterpretq_m128i_s8(
2073         vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
2074                     vtbl2_s8(a_split, vget_high_u8(idx_masked))));
2075 #endif
2076 }
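
// Minimal sketch (hypothetical helper): reversing the byte order of a vector
// with a constant control mask for _mm_shuffle_epi8.
FORCE_INLINE __m128i sse2neon_example_reverse_bytes(__m128i a)
{
    const uint8_t rev[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                             7,  6,  5,  4,  3,  2,  1, 0};
    return _mm_shuffle_epi8(a, vreinterpretq_m128i_u8(vld1q_u8(rev)));
}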
2077 
2078 // C equivalent:
2079 //   __m128i _mm_shuffle_epi32_default(__m128i a,
2080 //                                     __constrange(0, 255) int imm) {
2081 //       __m128i ret;
2082 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
2083 //       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
2084 //       return ret;
2085 //   }
2086 #define _mm_shuffle_epi32_default(a, imm)                                   \
2087     __extension__({                                                         \
2088         int32x4_t ret;                                                      \
2089         ret = vmovq_n_s32(                                                  \
2090             vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
2091         ret = vsetq_lane_s32(                                               \
2092             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
2093             ret, 1);                                                        \
2094         ret = vsetq_lane_s32(                                               \
2095             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
2096             ret, 2);                                                        \
2097         ret = vsetq_lane_s32(                                               \
2098             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
2099             ret, 3);                                                        \
2100         vreinterpretq_m128i_s32(ret);                                       \
2101     })
2102 
2103 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
2104 // int imm)
2105 #if defined(__aarch64__)
2106 #define _mm_shuffle_epi32_splat(a, imm)                          \
2107     __extension__({                                              \
2108         vreinterpretq_m128i_s32(                                 \
2109             vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
2110     })
2111 #else
2112 #define _mm_shuffle_epi32_splat(a, imm)                                      \
2113     __extension__({                                                          \
2114         vreinterpretq_m128i_s32(                                             \
2115             vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
2116     })
2117 #endif
2118 
2119 // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
2120 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
2121 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
2122 //                                        __constrange(0,255) int imm)
2123 #if __has_builtin(__builtin_shufflevector)
2124 #define _mm_shuffle_epi32(a, imm)                              \
2125     __extension__({                                            \
2126         int32x4_t _input = vreinterpretq_s32_m128i(a);         \
2127         int32x4_t _shuf = __builtin_shufflevector(             \
2128             _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2129             ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
2130         vreinterpretq_m128i_s32(_shuf);                        \
2131     })
2132 #else  // generic
2133 #define _mm_shuffle_epi32(a, imm)                        \
2134     __extension__({                                      \
2135         __m128i ret;                                     \
2136         switch (imm) {                                   \
2137         case _MM_SHUFFLE(1, 0, 3, 2):                    \
2138             ret = _mm_shuffle_epi_1032((a));             \
2139             break;                                       \
2140         case _MM_SHUFFLE(2, 3, 0, 1):                    \
2141             ret = _mm_shuffle_epi_2301((a));             \
2142             break;                                       \
2143         case _MM_SHUFFLE(0, 3, 2, 1):                    \
2144             ret = _mm_shuffle_epi_0321((a));             \
2145             break;                                       \
2146         case _MM_SHUFFLE(2, 1, 0, 3):                    \
2147             ret = _mm_shuffle_epi_2103((a));             \
2148             break;                                       \
2149         case _MM_SHUFFLE(1, 0, 1, 0):                    \
2150             ret = _mm_shuffle_epi_1010((a));             \
2151             break;                                       \
2152         case _MM_SHUFFLE(1, 0, 0, 1):                    \
2153             ret = _mm_shuffle_epi_1001((a));             \
2154             break;                                       \
2155         case _MM_SHUFFLE(0, 1, 0, 1):                    \
2156             ret = _mm_shuffle_epi_0101((a));             \
2157             break;                                       \
2158         case _MM_SHUFFLE(2, 2, 1, 1):                    \
2159             ret = _mm_shuffle_epi_2211((a));             \
2160             break;                                       \
2161         case _MM_SHUFFLE(0, 1, 2, 2):                    \
2162             ret = _mm_shuffle_epi_0122((a));             \
2163             break;                                       \
2164         case _MM_SHUFFLE(3, 3, 3, 2):                    \
2165             ret = _mm_shuffle_epi_3332((a));             \
2166             break;                                       \
2167         case _MM_SHUFFLE(0, 0, 0, 0):                    \
2168             ret = _mm_shuffle_epi32_splat((a), 0);       \
2169             break;                                       \
2170         case _MM_SHUFFLE(1, 1, 1, 1):                    \
2171             ret = _mm_shuffle_epi32_splat((a), 1);       \
2172             break;                                       \
2173         case _MM_SHUFFLE(2, 2, 2, 2):                    \
2174             ret = _mm_shuffle_epi32_splat((a), 2);       \
2175             break;                                       \
2176         case _MM_SHUFFLE(3, 3, 3, 3):                    \
2177             ret = _mm_shuffle_epi32_splat((a), 3);       \
2178             break;                                       \
2179         default:                                         \
2180             ret = _mm_shuffle_epi32_default((a), (imm)); \
2181             break;                                       \
2182         }                                                \
2183         ret;                                             \
2184     })
2185 #endif
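
// Illustrative sketch (hypothetical helper): broadcasting the lowest 32-bit
// lane of a vector; on the generic path the macro above routes
// _MM_SHUFFLE(0, 0, 0, 0) to _mm_shuffle_epi32_splat.
FORCE_INLINE __m128i sse2neon_example_splat_epi32(__m128i a)
{
    return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 0, 0));
}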
2186 
2187 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
2188 // by imm.
2189 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
2190 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
2191 //                                                   __constrange(0,255) int
2192 //                                                   imm)
2193 #define _mm_shufflelo_epi16_function(a, imm)                                  \
2194     __extension__({                                                           \
2195         int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
2196         int16x4_t lowBits = vget_low_s16(ret);                                \
2197         ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
2198         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
2199                              1);                                              \
2200         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
2201                              2);                                              \
2202         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
2203                              3);                                              \
2204         vreinterpretq_m128i_s16(ret);                                         \
2205     })
2206 
2207 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
2208 //                                          __constrange(0,255) int imm)
2209 #if __has_builtin(__builtin_shufflevector)
2210 #define _mm_shufflelo_epi16(a, imm)                                  \
2211     __extension__({                                                  \
2212         int16x8_t _input = vreinterpretq_s16_m128i(a);               \
2213         int16x8_t _shuf = __builtin_shufflevector(                   \
2214             _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
2215             (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
2216         vreinterpretq_m128i_s16(_shuf);                              \
2217     })
2218 #else  // generic
2219 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
2220 #endif
2221 
2222 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
2223 // by imm.
2224 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
2225 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
2226 //                                                   __constrange(0,255) int
2227 //                                                   imm)
2228 #define _mm_shufflehi_epi16_function(a, imm)                                   \
2229     __extension__({                                                            \
2230         int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
2231         int16x4_t highBits = vget_high_s16(ret);                               \
2232         ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
2233         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
2234                              5);                                               \
2235         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
2236                              6);                                               \
2237         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
2238                              7);                                               \
2239         vreinterpretq_m128i_s16(ret);                                          \
2240     })
2241 
2242 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
2243 //                                          __constrange(0,255) int imm)
2244 #if __has_builtin(__builtin_shufflevector)
2245 #define _mm_shufflehi_epi16(a, imm)                             \
2246     __extension__({                                             \
2247         int16x8_t _input = vreinterpretq_s16_m128i(a);          \
2248         int16x8_t _shuf = __builtin_shufflevector(              \
2249             _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
2250             (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
2251             (((imm) >> 6) & 0x3) + 4);                          \
2252         vreinterpretq_m128i_s16(_shuf);                         \
2253     })
2254 #else  // generic
2255 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
2256 #endif
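
// Minimal sketch (hypothetical helper): swapping the two lowest 16-bit lanes
// while leaving lanes 2..7 untouched.
FORCE_INLINE __m128i sse2neon_example_swap_low_words(__m128i a)
{
    return _mm_shufflelo_epi16(a, _MM_SHUFFLE(3, 2, 0, 1));
}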
2257 
2258 // Shuffle double-precision (64-bit) floating-point elements using the control
2259 // in imm8, and store the results in dst.
2260 //
2261 //   dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
2262 //   dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
2263 //
2264 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
2265 #if __has_builtin(__builtin_shufflevector)
2266 #define _mm_shuffle_pd(a, b, imm8)                                          \
2267     vreinterpretq_m128d_s64(__builtin_shufflevector(                        \
2268         vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
2269         ((imm8 & 0x2) >> 1) + 2))
2270 #else
2271 #define _mm_shuffle_pd(a, b, imm8)                                     \
2272     _mm_castsi128_pd(_mm_set_epi64x(                                   \
2273         vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
2274         vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
2275 #endif
2276 
2277 // Blend packed 16-bit integers from a and b using control mask imm8, and store
2278 // the results in dst.
2279 //
2280 //   FOR j := 0 to 7
2281 //       i := j*16
2282 //       IF imm8[j]
2283 //           dst[i+15:i] := b[i+15:i]
2284 //       ELSE
2285 //           dst[i+15:i] := a[i+15:i]
2286 //       FI
2287 //   ENDFOR
2288 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
2289 //                                      __constrange(0,255) int imm)
2290 #define _mm_blend_epi16(a, b, imm)                                        \
2291     __extension__({                                                       \
2292         const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000,  \
2293                                    ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,  \
2294                                    ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,  \
2295                                    ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,  \
2296                                    ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,  \
2297                                    ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,  \
2298                                    ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,  \
2299                                    ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
2300         uint16x8_t _mask_vec = vld1q_u16(_mask);                          \
2301         uint16x8_t _a = vreinterpretq_u16_m128i(a);                       \
2302         uint16x8_t _b = vreinterpretq_u16_m128i(b);                       \
2303         vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));            \
2304     })
2305 
2306 // Blend packed 8-bit integers from a and b using mask, and store the results in
2307 // dst.
2308 //
2309 //   FOR j := 0 to 15
2310 //       i := j*8
2311 //       IF mask[i+7]
2312 //           dst[i+7:i] := b[i+7:i]
2313 //       ELSE
2314 //           dst[i+7:i] := a[i+7:i]
2315 //       FI
2316 //   ENDFOR
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
2318 {
2319     // Use a signed shift right to create a mask with the sign bit
2320     uint8x16_t mask =
2321         vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
2322     uint8x16_t a = vreinterpretq_u8_m128i(_a);
2323     uint8x16_t b = vreinterpretq_u8_m128i(_b);
2324     return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
2325 }
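
// Illustrative sketch (hypothetical helper): with the constant mask 0xAA
// (imm8 bit j selects b for lane j), _mm_blend_epi16 keeps the even 16-bit
// lanes of a and takes the odd lanes from b.
FORCE_INLINE __m128i sse2neon_example_blend_odd_lanes(__m128i a, __m128i b)
{
    return _mm_blend_epi16(a, b, 0xAA);
}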
2326 
2327 /* Shifts */
2328 
2330 // Shift packed 16-bit integers in a right by imm while shifting in sign
2331 // bits, and store the results in dst.
2332 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
2334 {
2335     const int count = (imm & ~15) ? 15 : imm;
    return vreinterpretq_m128i_s16(
        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(-count)));
2337 }
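
// Minimal sketch (hypothetical helper): an arithmetic shift by 15 turns each
// 16-bit lane into its sign mask (0x0000 for non-negative, 0xFFFF for
// negative), a common way to build masks for _mm_blendv_epi8.
FORCE_INLINE __m128i sse2neon_example_sign_mask16(__m128i a)
{
    return _mm_srai_epi16(a, 15);
}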
2338 
2339 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2340 // shifting in zeros.
2341 //
2342 //   r0 := a0 << count
2343 //   r1 := a1 << count
2344 //   ...
2345 //   r7 := a7 << count
2346 //
2347 // https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
2348 #define _mm_slli_epi16(a, imm)                                   \
2349     __extension__({                                              \
2350         __m128i ret;                                             \
        if (unlikely((imm) <= 0)) {                              \
            ret = a;                                             \
        } else if (unlikely((imm) > 15)) {                       \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
2357             ret = vreinterpretq_m128i_s16(                       \
2358                 vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
2359         }                                                        \
2360         ret;                                                     \
2361     })
2362 
2363 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
// shifting in zeros.
2365 // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
2366 // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
2368 {
2369     if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
2370         return a;
2371     if (unlikely(imm > 31))
2372         return _mm_setzero_si128();
2373     return vreinterpretq_m128i_s32(
2374         vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
2375 }
2376 
2377 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
2378 // store the results in dst.
FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
2380 {
2381     if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
2382         return a;
2383     if (unlikely(imm > 63))
2384         return _mm_setzero_si128();
2385     return vreinterpretq_m128i_s64(
2386         vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
2387 }
2388 
2389 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
2390 // store the results in dst.
2391 //
2392 //   FOR j := 0 to 7
2393 //     i := j*16
2394 //     IF imm8[7:0] > 15
2395 //       dst[i+15:i] := 0
2396 //     ELSE
2397 //       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
2398 //     FI
2399 //   ENDFOR
2400 //
2401 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
2402 #define _mm_srli_epi16(a, imm)                                             \
2403     __extension__({                                                        \
2404         __m128i ret;                                                       \
        if (unlikely((imm) == 0)) {                                        \
            ret = a;                                                       \
        } else if (likely(0 < (imm) && (imm) < 16)) {                      \
            ret = vreinterpretq_m128i_u16(vshlq_u16(                       \
                vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm))));         \
        } else {                                                           \
2412             ret = _mm_setzero_si128();                                     \
2413         }                                                                  \
2414         ret;                                                               \
2415     })
2416 
2417 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
2418 // store the results in dst.
2419 //
2420 //   FOR j := 0 to 3
2421 //     i := j*32
2422 //     IF imm8[7:0] > 31
2423 //       dst[i+31:i] := 0
2424 //     ELSE
2425 //       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
2426 //     FI
2427 //   ENDFOR
2428 //
2429 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
2430 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm)                                     \
    __extension__({                                                \
        __m128i ret;                                               \
        if (unlikely((imm) == 0)) {                                \
            ret = a;                                               \
        } else if (likely(0 < (imm) && (imm) < 32)) {              \
            ret = vreinterpretq_m128i_u32(vshlq_u32(               \
                vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
        } else {                                                   \
            ret = _mm_setzero_si128();                             \
        }                                                          \
        ret;                                                       \
    })
2445 
2446 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
2447 // store the results in dst.
2448 //
2449 //   FOR j := 0 to 1
2450 //     i := j*64
2451 //     IF imm8[7:0] > 63
2452 //       dst[i+63:i] := 0
2453 //     ELSE
2454 //       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
2455 //     FI
2456 //   ENDFOR
2457 //
2458 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
#define _mm_srli_epi64(a, imm)                                     \
    __extension__({                                                \
        __m128i ret;                                               \
        if (unlikely((imm) == 0)) {                                \
            ret = a;                                               \
        } else if (likely(0 < (imm) && (imm) < 64)) {              \
            ret = vreinterpretq_m128i_u64(vshlq_u64(               \
                vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
        } else {                                                   \
            ret = _mm_setzero_si128();                             \
        }                                                          \
        ret;                                                       \
    })
2473 
2474 // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
2475 // and store the results in dst.
2476 //
2477 //   FOR j := 0 to 3
2478 //     i := j*32
2479 //     IF imm8[7:0] > 31
2480 //       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
2481 //     ELSE
2482 //       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
2483 //     FI
2484 //   ENDFOR
2485 //
2486 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
2487 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm)                                     \
    __extension__({                                                \
        __m128i ret;                                               \
        if (unlikely((imm) == 0)) {                                \
            ret = a;                                               \
        } else if (likely(0 < (imm) && (imm) < 32)) {              \
            ret = vreinterpretq_m128i_s32(vshlq_s32(               \
                vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
        } else {                                                   \
            ret = vreinterpretq_m128i_s32(                         \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));      \
        }                                                          \
        ret;                                                       \
    })
2503 
// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
// imm must be an immediate.
2506 //
2507 //   r := srl(a, imm*8)
2508 //
2509 // https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
2510 // FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_si128(a, imm)                                              \
    __extension__({                                                         \
        __m128i ret;                                                        \
        if (unlikely((imm) <= 0)) {                                         \
            ret = a;                                                        \
        } else if (unlikely((imm) > 15)) {                                  \
            ret = _mm_setzero_si128();                                      \
        } else {                                                            \
            ret = vreinterpretq_m128i_s8(                                   \
                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
        }                                                                   \
        ret;                                                                \
    })
2525 
2526 // Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
2527 // must be an immediate.
2528 //
2529 //   r := a << (imm * 8)
2530 //
2531 // https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
2532 // FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_si128(a, imm)                                          \
    __extension__({                                                     \
        __m128i ret;                                                    \
        if (unlikely((imm) <= 0)) {                                     \
            ret = a;                                                    \
        } else if (unlikely((imm) > 15)) {                              \
            ret = _mm_setzero_si128();                                  \
        } else {                                                        \
            ret = vreinterpretq_m128i_s8(vextq_s8(                      \
                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
        }                                                               \
        ret;                                                            \
    })
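
// Illustrative sketch (not part of the translation layer above): both byte
// shifts are built on vextq_s8, which extracts 16 consecutive bytes from the
// concatenation of two vectors; pairing the source with a zero vector and
// choosing the extraction offset shifts whole bytes while filling with zeros.
// A scalar view of _mm_slli_si128's byte movement, with a made-up helper name
// and kept under "#if 0" so it is never compiled:
#if 0
static inline void slli_si128_scalar_sketch(uint8_t dst[16],
                                            const uint8_t src[16],
                                            int imm /* assumed 1..15 */)
{
    for (int i = 0; i < 16; i++)
        dst[i] = (i < imm) ? 0 : src[i - imm]; /* low bytes are zero-filled */
}
#endif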
2547 
2548 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2549 // shifting in zeros.
2550 //
2551 //   r0 := a0 << count
2552 //   r1 := a1 << count
2553 //   ...
2554 //   r7 := a7 << count
2555 //
2556 // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
2558 {
2559     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2560     if (unlikely(c > 15))
2561         return _mm_setzero_si128();
2562 
2563     int16x8_t vc = vdupq_n_s16((int16_t) c);
2564     return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
2565 }
2566 
2567 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2568 // shifting in zeros.
2569 //
2570 // r0 := a0 << count
2571 // r1 := a1 << count
2572 // r2 := a2 << count
2573 // r3 := a3 << count
2574 //
2575 // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
2577 {
2578     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2579     if (unlikely(c > 31))
2580         return _mm_setzero_si128();
2581 
2582     int32x4_t vc = vdupq_n_s32((int32_t) c);
2583     return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
2584 }
2585 
2586 // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
2587 // shifting in zeros.
2588 //
2589 // r0 := a0 << count
2590 // r1 := a1 << count
2591 //
2592 // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
2594 {
2595     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2596     if (unlikely(c > 63))
2597         return _mm_setzero_si128();
2598 
2599     int64x2_t vc = vdupq_n_s64((int64_t) c);
2600     return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
2601 }
2602 
2603 // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
2604 // while shifting in zeros.
2605 //
2606 // r0 := srl(a0, count)
2607 // r1 := srl(a1, count)
2608 // ...
2609 // r7 := srl(a7, count)
2610 //
2611 // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
2613 {
2614     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2615     if (unlikely(c > 15))
2616         return _mm_setzero_si128();
2617 
2618     int16x8_t vc = vdupq_n_s16(-(int16_t) c);
2619     return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
2620 }
2621 
2622 // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
2623 // while shifting in zeros.
2624 //
2625 // r0 := srl(a0, count)
2626 // r1 := srl(a1, count)
2627 // r2 := srl(a2, count)
2628 // r3 := srl(a3, count)
2629 //
2630 // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
2632 {
2633     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2634     if (unlikely(c > 31))
2635         return _mm_setzero_si128();
2636 
2637     int32x4_t vc = vdupq_n_s32(-(int32_t) c);
2638     return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
2639 }
2640 
2641 // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
2642 // while shifting in zeros.
2643 //
2644 // r0 := srl(a0, count)
2645 // r1 := srl(a1, count)
2646 //
2647 // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
2649 {
2650     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2651     if (unlikely(c > 63))
2652         return _mm_setzero_si128();
2653 
2654     int64x2_t vc = vdupq_n_s64(-(int64_t) c);
2655     return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
2656 }
2657 
2658 // NEON does not provide a version of this function.
2659 // Creates a 16-bit mask from the most significant bits of the 16 signed or
2660 // unsigned 8-bit integers in a and zero extends the upper bits.
2661 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
2663 {
2664     // Use increasingly wide shifts+adds to collect the sign bits
2665     // together.
2666     // Since the widening shifts would be rather confusing to follow in little
2667     // endian, everything will be illustrated in big endian order instead. This
2668     // has a different result - the bits would actually be reversed on a big
2669     // endian machine.
2670 
2671     // Starting input (only half the elements are shown):
2672     // 89 ff 1d c0 00 10 99 33
2673     uint8x16_t input = vreinterpretq_u8_m128i(a);
2674 
2675     // Shift out everything but the sign bits with an unsigned shift right.
2676     //
    // Bytes of the vector:
2678     // 89 ff 1d c0 00 10 99 33
2679     // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
2680     //  |  |  |  |  |  |  |  |
2681     // 01 01 00 01 00 00 01 00
2682     //
2683     // Bits of first important lane(s):
2684     // 10001001 (89)
2685     // \______
2686     //        |
2687     // 00000001 (01)
2688     uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
2689 
2690     // Merge the even lanes together with a 16-bit unsigned shift right + add.
2691     // 'xx' represents garbage data which will be ignored in the final result.
2692     // In the important bytes, the add functions like a binary OR.
2693     //
2694     // 01 01 00 01 00 00 01 00
    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(high_bits + (high_bits >> 7))
2696     //    \|    \|    \|    \|
2697     // xx 03 xx 01 xx 00 xx 02
2698     //
2699     // 00000001 00000001 (01 01)
2700     //        \_______ |
2701     //                \|
2702     // xxxxxxxx xxxxxx11 (xx 03)
2703     uint32x4_t paired16 =
2704         vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2705 
2706     // Repeat with a wider 32-bit shift + add.
2707     // xx 03 xx 01 xx 00 xx 02
2708     //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
2709     //     14))
2710     //          \|          \|
2711     // xx xx xx 0d xx xx xx 02
2712     //
2713     // 00000011 00000001 (03 01)
2714     //        \\_____ ||
2715     //         '----.\||
2716     // xxxxxxxx xxxx1101 (xx 0d)
2717     uint64x2_t paired32 =
2718         vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2719 
2720     // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
2721     // lanes. xx xx xx 0d xx xx xx 02
2722     //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
2723     //            28))
2724     //                      \|
2725     // xx xx xx xx xx xx xx d2
2726     //
2727     // 00001101 00000010 (0d 02)
2728     //     \   \___ |  |
2729     //      '---.  \|  |
2730     // xxxxxxxx 11010010 (xx d2)
2731     uint8x16_t paired64 =
2732         vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2733 
2734     // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
2735     // xx xx xx xx xx xx xx d2
2736     //                      ||  return paired64[0]
2737     //                      d2
2738     // Note: Little endian would return the correct value 4b (01001011) instead.
2739     return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
2740 }
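
// Illustrative sketch (not part of the translation layer above): the widening
// shift-and-add cascade is only collecting one sign bit per byte into a 16-bit
// mask. A plain scalar reference of the intended result, with a made-up helper
// name and kept under "#if 0" so it is never compiled:
#if 0
static inline int movemask_epi8_scalar_sketch(const uint8_t bytes[16])
{
    int mask = 0;
    for (int i = 0; i < 16; i++)
        mask |= (bytes[i] >> 7) << i; /* bit i of the mask = MSB of byte i */
    return mask;
}
#endif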
2741 
2742 // Copy the lower 64-bit integer in a to dst.
2743 //
2744 //   dst[63:0] := a[63:0]
2745 //
2746 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
2748 {
2749     return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
2750 }
2751 
2752 // Copy the 64-bit integer a to the lower element of dst, and zero the upper
2753 // element.
2754 //
2755 //   dst[63:0] := a[63:0]
2756 //   dst[127:64] := 0
2757 //
2758 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
2760 {
2761     return vreinterpretq_m128i_s64(
2762         vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
2763 }
2764 
2765 // NEON does not provide this method
2766 // Creates a 4-bit mask from the most significant bits of the four
2767 // single-precision, floating-point values.
2768 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_ps(__m128 a)
2770 {
2771     uint32x4_t input = vreinterpretq_u32_m128(a);
2772 #if defined(__aarch64__)
2773     static const int32x4_t shift = {0, 1, 2, 3};
2774     uint32x4_t tmp = vshrq_n_u32(input, 31);
2775     return vaddvq_u32(vshlq_u32(tmp, shift));
2776 #else
2777     // Uses the exact same method as _mm_movemask_epi8, see that for details.
2778     // Shift out everything but the sign bits with a 32-bit unsigned shift
2779     // right.
2780     uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2781     // Merge the two pairs together with a 64-bit unsigned shift right + add.
2782     uint8x16_t paired =
2783         vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2784     // Extract the result.
2785     return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2786 #endif
2787 }
2788 
2789 // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
2790 // all 1's, and return 1 if the result is zero, otherwise return 0.
2791 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
FORCE_INLINE int _mm_test_all_ones(__m128i a)
2793 {
2794     return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
2795            ~(uint64_t) 0;
2796 }
2797 
2798 // Compute the bitwise AND of 128 bits (representing integer data) in a and
2799 // mask, and return 1 if the result is zero, otherwise return 0.
2800 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
2802 {
2803     int64x2_t a_and_mask =
2804         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
2805     return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
2806                                                                            : 1;
2807 }
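
// Illustrative usage (an assumption about typical caller code, not taken from
// this header): _mm_test_all_zeros is usually read as "is (a & mask) == 0?".
// The sketch below, kept under "#if 0" so it is never compiled, uses the
// standard SSE2 intrinsics _mm_cmpeq_epi8 and _mm_set1_epi8 to ask whether any
// byte of a equals the corresponding byte of b:
#if 0
static inline int any_byte_equal_sketch(__m128i a, __m128i b)
{
    __m128i eq = _mm_cmpeq_epi8(a, b); /* 0xFF where the bytes match */
    return !_mm_test_all_zeros(eq, _mm_set1_epi8((char) 0xFF));
}
#endif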
2808 
2809 /* Math operations */
2810 
2811 // Subtracts the four single-precision, floating-point values of a and b.
2812 //
2813 //   r0 := a0 - b0
2814 //   r1 := a1 - b1
2815 //   r2 := a2 - b2
2816 //   r3 := a3 - b3
2817 //
2818 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2820 {
2821     return vreinterpretq_m128_f32(
2822         vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2823 }
2824 
2825 // Subtract the lower single-precision (32-bit) floating-point element in b from
2826 // the lower single-precision (32-bit) floating-point element in a, store the
2827 // result in the lower element of dst, and copy the upper 3 packed elements from
2828 // a to the upper elements of dst.
2829 //
2830 //   dst[31:0] := a[31:0] - b[31:0]
2831 //   dst[127:32] := a[127:32]
2832 //
2833 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2835 {
2836     return _mm_move_ss(a, _mm_sub_ps(a, b));
2837 }
2838 
2839 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
2840 // and store the results in dst.
2841 //    r0 := a0 - b0
2842 //    r1 := a1 - b1
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
2844 {
2845     return vreinterpretq_m128i_s64(
2846         vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2847 }
2848 
2849 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
2850 // unsigned 32-bit integers of a.
2851 //
2852 //   r0 := a0 - b0
2853 //   r1 := a1 - b1
2854 //   r2 := a2 - b2
2855 //   r3 := a3 - b3
2856 //
2857 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
2859 {
2860     return vreinterpretq_m128i_s32(
2861         vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2862 }
2863 
2864 // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
2865 // store the results in dst.
2866 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
2868 {
2869     return vreinterpretq_m128i_s16(
2870         vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2871 }
2872 
2873 // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
2874 // store the results in dst.
2875 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
2877 {
2878     return vreinterpretq_m128i_s8(
2879         vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2880 }
2881 
2882 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
2883 //
2884 //   dst[63:0] := a[63:0] - b[63:0]
2885 //
2886 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
2888 {
2889     return vreinterpret_m64_s64(
2890         vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2891 }
2892 
// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a and saturates.
2895 // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
2897 {
2898     return vreinterpretq_m128i_u16(
2899         vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2900 }
2901 
2902 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
2903 // integers of a and saturates.
2904 //
2905 //   r0 := UnsignedSaturate(a0 - b0)
2906 //   r1 := UnsignedSaturate(a1 - b1)
2907 //   ...
2908 //   r15 := UnsignedSaturate(a15 - b15)
2909 //
2910 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
2912 {
2913     return vreinterpretq_m128i_u8(
2914         vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2915 }
2916 
2917 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
2918 // of a and saturates.
2919 //
2920 //   r0 := SignedSaturate(a0 - b0)
2921 //   r1 := SignedSaturate(a1 - b1)
2922 //   ...
2923 //   r15 := SignedSaturate(a15 - b15)
2924 //
2925 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
2927 {
2928     return vreinterpretq_m128i_s8(
2929         vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2930 }
2931 
2932 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
2933 // of a and saturates.
2934 //
2935 //   r0 := SignedSaturate(a0 - b0)
2936 //   r1 := SignedSaturate(a1 - b1)
2937 //   ...
2938 //   r7 := SignedSaturate(a7 - b7)
2939 //
2940 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2942 {
2943     return vreinterpretq_m128i_s16(
2944         vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2945 }
2946 
2947 // Subtract packed double-precision (64-bit) floating-point elements in b from
2948 // packed double-precision (64-bit) floating-point elements in a, and store the
2949 // results in dst.
2950 //
2951 //   FOR j := 0 to 1
2952 //     i := j*64
2953 //     dst[i+63:i] := a[i+63:i] - b[i+63:i]
2954 //   ENDFOR
2955 //
2956 //  https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
2958 {
2959 #if defined(__aarch64__)
2960     return vreinterpretq_m128d_f64(
2961         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2962 #else
2963     double *da = (double *) &a;
2964     double *db = (double *) &b;
2965     double c[2];
2966     c[0] = da[0] - db[0];
2967     c[1] = da[1] - db[1];
2968     return vld1q_f32((float32_t *) c);
2969 #endif
2970 }
2971 
2972 // Subtract the lower double-precision (64-bit) floating-point element in b from
2973 // the lower double-precision (64-bit) floating-point element in a, store the
2974 // result in the lower element of dst, and copy the upper element from a to the
2975 // upper element of dst.
2976 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
2978 {
2979     return _mm_move_sd(a, _mm_sub_pd(a, b));
2980 }
2981 
2982 // Add packed unsigned 16-bit integers in a and b using saturation, and store
2983 // the results in dst.
2984 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2986 {
2987     return vreinterpretq_m128i_u16(
2988         vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2989 }
2990 
2991 // Negate packed 8-bit integers in a when the corresponding signed
2992 // 8-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
2994 // in b is zero.
2995 //
2996 //   for i in 0..15
2997 //     if b[i] < 0
2998 //       r[i] := -a[i]
2999 //     else if b[i] == 0
3000 //       r[i] := 0
3001 //     else
3002 //       r[i] := a[i]
3003 //     fi
3004 //   done
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
3006 {
3007     int8x16_t a = vreinterpretq_s8_m128i(_a);
3008     int8x16_t b = vreinterpretq_s8_m128i(_b);
3009 
3010     // signed shift right: faster than vclt
3011     // (b < 0) ? 0xFF : 0
3012     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
3013 
3014     // (b == 0) ? 0xFF : 0
3015 #if defined(__aarch64__)
3016     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
3017 #else
3018     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
3019 #endif
3020 
    // bitwise select either a or negated 'a' (vnegq_s8(a) returns the negation
    // of 'a') based on ltMask
3023     int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
3024     // res = masked & (~zeroMask)
3025     int8x16_t res = vbicq_s8(masked, zeroMask);
3026 
3027     return vreinterpretq_m128i_s8(res);
3028 }
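
// Illustrative sketch (not part of the translation layer above): a scalar
// reference for the "sign" family matching the pseudocode above, with a
// made-up helper name and kept under "#if 0" so it is never compiled:
#if 0
static inline int8_t sign_epi8_scalar_sketch(int8_t a, int8_t b)
{
    if (b < 0)
        return (int8_t) -a; /* negate when b is negative */
    if (b == 0)
        return 0; /* zero when b is zero */
    return a; /* pass a through otherwise */
}
#endif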
3029 
3030 // Negate packed 16-bit integers in a when the corresponding signed
3031 // 16-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
3033 // in b is zero.
3034 //
3035 //   for i in 0..7
3036 //     if b[i] < 0
3037 //       r[i] := -a[i]
3038 //     else if b[i] == 0
3039 //       r[i] := 0
3040 //     else
3041 //       r[i] := a[i]
3042 //     fi
3043 //   done
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
3045 {
3046     int16x8_t a = vreinterpretq_s16_m128i(_a);
3047     int16x8_t b = vreinterpretq_s16_m128i(_b);
3048 
3049     // signed shift right: faster than vclt
3050     // (b < 0) ? 0xFFFF : 0
3051     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
3052     // (b == 0) ? 0xFFFF : 0
3053 #if defined(__aarch64__)
3054     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
3055 #else
3056     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
3057 #endif
3058 
    // bitwise select either a or negated 'a' (vnegq_s16(a) returns the negation
    // of 'a') based on ltMask
3061     int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
3062     // res = masked & (~zeroMask)
3063     int16x8_t res = vbicq_s16(masked, zeroMask);
3064     return vreinterpretq_m128i_s16(res);
3065 }
3066 
3067 // Negate packed 32-bit integers in a when the corresponding signed
3068 // 32-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
3070 // in b is zero.
3071 //
3072 //   for i in 0..3
3073 //     if b[i] < 0
3074 //       r[i] := -a[i]
3075 //     else if b[i] == 0
3076 //       r[i] := 0
3077 //     else
3078 //       r[i] := a[i]
3079 //     fi
3080 //   done
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
3082 {
3083     int32x4_t a = vreinterpretq_s32_m128i(_a);
3084     int32x4_t b = vreinterpretq_s32_m128i(_b);
3085 
3086     // signed shift right: faster than vclt
3087     // (b < 0) ? 0xFFFFFFFF : 0
3088     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
3089 
3090     // (b == 0) ? 0xFFFFFFFF : 0
3091 #if defined(__aarch64__)
3092     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
3093 #else
3094     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
3095 #endif
3096 
    // bitwise select either a or negated 'a' (vnegq_s32(a) returns the negation
    // of 'a') based on ltMask
3099     int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
3100     // res = masked & (~zeroMask)
3101     int32x4_t res = vbicq_s32(masked, zeroMask);
3102     return vreinterpretq_m128i_s32(res);
3103 }
3104 
3105 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative, and store the results in dst. Elements in dst are
3107 // zeroed out when the corresponding element in b is zero.
3108 //
3109 //   FOR j := 0 to 3
3110 //      i := j*16
3111 //      IF b[i+15:i] < 0
3112 //        dst[i+15:i] := -(a[i+15:i])
3113 //      ELSE IF b[i+15:i] == 0
3114 //        dst[i+15:i] := 0
3115 //      ELSE
3116 //        dst[i+15:i] := a[i+15:i]
3117 //      FI
3118 //   ENDFOR
3119 //
3120 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
3122 {
3123     int16x4_t a = vreinterpret_s16_m64(_a);
3124     int16x4_t b = vreinterpret_s16_m64(_b);
3125 
3126     // signed shift right: faster than vclt
3127     // (b < 0) ? 0xFFFF : 0
3128     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
3129 
3130     // (b == 0) ? 0xFFFF : 0
3131 #if defined(__aarch64__)
3132     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
3133 #else
3134     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
3135 #endif
3136 
    // bitwise select either a or negated 'a' (vneg_s16(a) returns the negation
    // of 'a') based on ltMask
3139     int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
3140     // res = masked & (~zeroMask)
3141     int16x4_t res = vbic_s16(masked, zeroMask);
3142 
3143     return vreinterpret_m64_s16(res);
3144 }
3145 
3146 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative, and store the results in dst. Elements in dst are
3148 // zeroed out when the corresponding element in b is zero.
3149 //
3150 //   FOR j := 0 to 1
3151 //      i := j*32
3152 //      IF b[i+31:i] < 0
3153 //        dst[i+31:i] := -(a[i+31:i])
3154 //      ELSE IF b[i+31:i] == 0
3155 //        dst[i+31:i] := 0
3156 //      ELSE
3157 //        dst[i+31:i] := a[i+31:i]
3158 //      FI
3159 //   ENDFOR
3160 //
3161 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
3163 {
3164     int32x2_t a = vreinterpret_s32_m64(_a);
3165     int32x2_t b = vreinterpret_s32_m64(_b);
3166 
3167     // signed shift right: faster than vclt
3168     // (b < 0) ? 0xFFFFFFFF : 0
3169     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
3170 
3171     // (b == 0) ? 0xFFFFFFFF : 0
3172 #if defined(__aarch64__)
3173     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
3174 #else
3175     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
3176 #endif
3177 
    // bitwise select either a or negated 'a' (vneg_s32(a) returns the negation
    // of 'a') based on ltMask
3180     int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
3181     // res = masked & (~zeroMask)
3182     int32x2_t res = vbic_s32(masked, zeroMask);
3183 
3184     return vreinterpret_m64_s32(res);
3185 }
3186 
3187 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
// in b is negative, and store the results in dst. Elements in dst are zeroed out
3189 // when the corresponding element in b is zero.
3190 //
3191 //   FOR j := 0 to 7
3192 //      i := j*8
3193 //      IF b[i+7:i] < 0
3194 //        dst[i+7:i] := -(a[i+7:i])
3195 //      ELSE IF b[i+7:i] == 0
3196 //        dst[i+7:i] := 0
3197 //      ELSE
3198 //        dst[i+7:i] := a[i+7:i]
3199 //      FI
3200 //   ENDFOR
3201 //
3202 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
3204 {
3205     int8x8_t a = vreinterpret_s8_m64(_a);
3206     int8x8_t b = vreinterpret_s8_m64(_b);
3207 
3208     // signed shift right: faster than vclt
3209     // (b < 0) ? 0xFF : 0
3210     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
3211 
3212     // (b == 0) ? 0xFF : 0
3213 #if defined(__aarch64__)
3214     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
3215 #else
3216     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
3217 #endif
3218 
    // bitwise select either a or negated 'a' (vneg_s8(a) returns the negation
    // of 'a') based on ltMask
3221     int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
3222     // res = masked & (~zeroMask)
3223     int8x8_t res = vbic_s8(masked, zeroMask);
3224 
3225     return vreinterpret_m64_s8(res);
3226 }
3227 
3228 // Average packed unsigned 16-bit integers in a and b, and store the results in
3229 // dst.
3230 //
3231 //   FOR j := 0 to 3
3232 //     i := j*16
3233 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
3234 //   ENDFOR
3235 //
3236 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
3238 {
3239     return vreinterpret_m64_u16(
3240         vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
3241 }
3242 
3243 // Average packed unsigned 8-bit integers in a and b, and store the results in
3244 // dst.
3245 //
3246 //   FOR j := 0 to 7
3247 //     i := j*8
3248 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
3249 //   ENDFOR
3250 //
3251 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
3253 {
3254     return vreinterpret_m64_u8(
3255         vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3256 }
3257 
3258 // Average packed unsigned 8-bit integers in a and b, and store the results in
3259 // dst.
3260 //
3261 //   FOR j := 0 to 7
3262 //     i := j*8
3263 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
3264 //   ENDFOR
3265 //
3266 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
3267 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
3268 
3269 // Average packed unsigned 16-bit integers in a and b, and store the results in
3270 // dst.
3271 //
3272 //   FOR j := 0 to 3
3273 //     i := j*16
3274 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
3275 //   ENDFOR
3276 //
3277 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
3278 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
3279 
3280 // Extract a 16-bit integer from a, selected with imm8, and store the result in
3281 // the lower element of dst.
3282 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
3283 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
3284 
3285 // Copy a to dst, and insert the 16-bit integer i into dst at the location
3286 // specified by imm8.
3287 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
3288 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
3289 
3290 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3291 // values in dst.
3292 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
3293 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
3294 
3295 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3296 // values in dst.
3297 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
3298 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
3299 
3300 // Compare packed signed 16-bit integers in a and b, and store packed minimum
3301 // values in dst.
3302 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
3303 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
3304 
3305 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3306 // values in dst.
3307 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
3308 #define _m_pminub(a, b) _mm_min_pu8(a, b)
3309 
3310 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
3311 // unsigned 8-bit integers in b and rounds.
3312 //
3313 //   r0 := (a0 + b0) / 2
3314 //   r1 := (a1 + b1) / 2
3315 //   ...
3316 //   r15 := (a15 + b15) / 2
3317 //
3318 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3320 {
3321     return vreinterpretq_m128i_u8(
3322         vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3323 }
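
// Illustrative sketch (not part of the translation layer above): vrhaddq_u8 is
// a rounding halving add, so each lane computes (a + b + 1) >> 1 without
// losing the carry, which is exactly the PAVGB rounding rule. Scalar view,
// with a made-up helper name and kept under "#if 0" so it is never compiled:
#if 0
static inline uint8_t avg_epu8_scalar_sketch(uint8_t a, uint8_t b)
{
    return (uint8_t) (((unsigned) a + (unsigned) b + 1) >> 1);
}
#endif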
3324 
3325 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
3326 // unsigned 16-bit integers in b and rounds.
3327 //
3328 //   r0 := (a0 + b0) / 2
3329 //   r1 := (a1 + b1) / 2
3330 //   ...
3331 //   r7 := (a7 + b7) / 2
3332 //
3333 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3335 {
3336     return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3337                                  vreinterpretq_u16_m128i(b));
3338 }
3339 
3340 // Adds the four single-precision, floating-point values of a and b.
3341 //
3342 //   r0 := a0 + b0
3343 //   r1 := a1 + b1
3344 //   r2 := a2 + b2
3345 //   r3 := a3 + b3
3346 //
3347 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
3349 {
3350     return vreinterpretq_m128_f32(
3351         vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3352 }
3353 
3354 // Add packed double-precision (64-bit) floating-point elements in a and b, and
3355 // store the results in dst.
3356 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
3358 {
3359 #if defined(__aarch64__)
3360     return vreinterpretq_m128d_f64(
3361         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3362 #else
3363     double *da = (double *) &a;
3364     double *db = (double *) &b;
3365     double c[2];
3366     c[0] = da[0] + db[0];
3367     c[1] = da[1] + db[1];
3368     return vld1q_f32((float32_t *) c);
3369 #endif
3370 }
3371 
3372 // Add the lower double-precision (64-bit) floating-point element in a and b,
3373 // store the result in the lower element of dst, and copy the upper element from
3374 // a to the upper element of dst.
3375 //
3376 //   dst[63:0] := a[63:0] + b[63:0]
3377 //   dst[127:64] := a[127:64]
3378 //
3379 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
3381 {
3382 #if defined(__aarch64__)
3383     return _mm_move_sd(a, _mm_add_pd(a, b));
3384 #else
3385     double *da = (double *) &a;
3386     double *db = (double *) &b;
3387     double c[2];
3388     c[0] = da[0] + db[0];
3389     c[1] = da[1];
3390     return vld1q_f32((float32_t *) c);
3391 #endif
3392 }
3393 
3394 // Add 64-bit integers a and b, and store the result in dst.
3395 //
3396 //   dst[63:0] := a[63:0] + b[63:0]
3397 //
3398 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
3400 {
3401     return vreinterpret_m64_s64(
3402         vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
3403 }
3404 
// Adds the scalar single-precision, floating-point values of a and b.
3406 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
3408 {
3409     float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
3410     float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
3411     // the upper values in the result must be the remnants of <a>.
3412     return vreinterpretq_m128_f32(vaddq_f32(a, value));
3413 }
3414 
// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
3417 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
3419 {
3420     return vreinterpretq_m128i_s64(
3421         vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
3422 }
3423 
3424 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
3425 // unsigned 32-bit integers in b.
3426 //
3427 //   r0 := a0 + b0
3428 //   r1 := a1 + b1
3429 //   r2 := a2 + b2
3430 //   r3 := a3 + b3
3431 //
3432 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
3434 {
3435     return vreinterpretq_m128i_s32(
3436         vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3437 }
3438 
3439 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
3440 // unsigned 16-bit integers in b.
3441 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
3443 {
3444     return vreinterpretq_m128i_s16(
3445         vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3446 }
3447 
3448 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
3449 // unsigned 8-bit integers in b.
3450 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
3452 {
3453     return vreinterpretq_m128i_s8(
3454         vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3455 }
3456 
3457 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
3458 // and saturates.
3459 //
3460 //   r0 := SignedSaturate(a0 + b0)
3461 //   r1 := SignedSaturate(a1 + b1)
3462 //   ...
3463 //   r7 := SignedSaturate(a7 + b7)
3464 //
3465 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3467 {
3468     return vreinterpretq_m128i_s16(
3469         vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3470 }
3471 
3472 // Add packed signed 8-bit integers in a and b using saturation, and store the
3473 // results in dst.
3474 //
3475 //   FOR j := 0 to 15
3476 //     i := j*8
3477 //     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3478 //   ENDFOR
3479 //
3480 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3482 {
3483     return vreinterpretq_m128i_s8(
3484         vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3485 }
3486 
3487 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
// b and saturates.
3489 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3491 {
3492     return vreinterpretq_m128i_u8(
3493         vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3494 }
3495 
3496 // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
3497 // unsigned 16-bit integers from b.
3498 //
3499 //   r0 := (a0 * b0)[15:0]
3500 //   r1 := (a1 * b1)[15:0]
3501 //   ...
3502 //   r7 := (a7 * b7)[15:0]
3503 //
3504 // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
3506 {
3507     return vreinterpretq_m128i_s16(
3508         vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3509 }
3510 
3511 // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
3512 // unsigned 32-bit integers from b.
3513 // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
3515 {
3516     return vreinterpretq_m128i_s32(
3517         vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3518 }
3519 
3520 // Multiply the packed unsigned 16-bit integers in a and b, producing
3521 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3522 // integers in dst.
3523 //
3524 //   FOR j := 0 to 3
3525 //      i := j*16
3526 //      tmp[31:0] := a[i+15:i] * b[i+15:i]
3527 //      dst[i+15:i] := tmp[31:16]
3528 //   ENDFOR
3529 //
3530 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
3531 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
3532 
3533 // Multiplies the four single-precision, floating-point values of a and b.
3534 //
3535 //   r0 := a0 * b0
3536 //   r1 := a1 * b1
3537 //   r2 := a2 * b2
3538 //   r3 := a3 * b3
3539 //
3540 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
3542 {
3543     return vreinterpretq_m128_f32(
3544         vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3545 }
3546 
3547 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
3548 // and store the results in dst.
3549 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
3551 {
3552 #if defined(__aarch64__)
3553     return vreinterpretq_m128d_f64(
3554         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3555 #else
3556     double *da = (double *) &a;
3557     double *db = (double *) &b;
3558     double c[2];
3559     c[0] = da[0] * db[0];
3560     c[1] = da[1] * db[1];
3561     return vld1q_f32((float32_t *) c);
3562 #endif
3563 }
3564 
3565 // Multiply the lower double-precision (64-bit) floating-point element in a and
3566 // b, store the result in the lower element of dst, and copy the upper element
3567 // from a to the upper element of dst.
3568 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
3570 {
3571     return _mm_move_sd(a, _mm_mul_pd(a, b));
3572 }
3573 
3574 // Multiply the lower single-precision (32-bit) floating-point element in a and
3575 // b, store the result in the lower element of dst, and copy the upper 3 packed
3576 // elements from a to the upper elements of dst.
3577 //
3578 //   dst[31:0] := a[31:0] * b[31:0]
3579 //   dst[127:32] := a[127:32]
3580 //
3581 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
3583 {
3584     return _mm_move_ss(a, _mm_mul_ps(a, b));
3585 }
3586 
3587 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
3588 // a and b, and store the unsigned 64-bit results in dst.
3589 //
3590 //   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
3591 //   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
3593 {
3594     // vmull_u32 upcasts instead of masking, so we downcast.
3595     uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
3596     uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
3597     return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
3598 }
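
// Illustrative sketch (not part of the translation layer above): vmovn_u64
// keeps the low 32 bits of each 64-bit lane, which plays the role of the
// "& 0xFFFFFFFF" masking in the pseudocode above, and vmull_u32 then widens
// the 32x32 products back to 64 bits. Scalar view of one lane, with a made-up
// helper name and kept under "#if 0" so it is never compiled:
#if 0
static inline uint64_t mul_epu32_scalar_sketch(uint64_t a_lane, uint64_t b_lane)
{
    return (uint64_t) (uint32_t) a_lane * (uint32_t) b_lane;
}
#endif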
3599 
3600 // Multiply the low unsigned 32-bit integers from a and b, and store the
3601 // unsigned 64-bit result in dst.
3602 //
3603 //   dst[63:0] := a[31:0] * b[31:0]
3604 //
3605 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
3607 {
3608     return vreinterpret_m64_u64(vget_low_u64(
3609         vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
3610 }
3611 
3612 // Multiply the low signed 32-bit integers from each packed 64-bit element in
3613 // a and b, and store the signed 64-bit results in dst.
3614 //
3615 //   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
3616 //   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
3618 {
3619     // vmull_s32 upcasts instead of masking, so we downcast.
3620     int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
3621     int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
3622     return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
3623 }
3624 
3625 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3626 // integers from b.
3627 //
3628 //   r0 := (a0 * b0) + (a1 * b1)
3629 //   r1 := (a2 * b2) + (a3 * b3)
3630 //   r2 := (a4 * b4) + (a5 * b5)
3631 //   r3 := (a6 * b6) + (a7 * b7)
3632 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
3634 {
3635     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3636                               vget_low_s16(vreinterpretq_s16_m128i(b)));
3637     int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3638                                vget_high_s16(vreinterpretq_s16_m128i(b)));
3639 
3640     int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
3641     int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
3642 
3643     return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
3644 }
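
// Illustrative sketch (not part of the translation layer above): PMADDWD forms
// eight 32-bit products and adds them pairwise, which the widening multiply
// (vmull_s16) plus pairwise add (vpadd_s32) above reproduces. Scalar view of
// one output lane, with a made-up helper name and kept under "#if 0" so it is
// never compiled:
#if 0
static inline int32_t madd_epi16_scalar_sketch(int16_t a0, int16_t b0,
                                               int16_t a1, int16_t b1)
{
    return (int32_t) a0 * b0 + (int32_t) a1 * b1;
}
#endif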
3645 
3646 // Multiply packed signed 16-bit integers in a and b, producing intermediate
3647 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
3648 // the packed 16-bit integers in dst.
3649 //
3650 //   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
3651 //   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
3652 //   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
3653 //   ...
3654 //   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
3656 {
3657     // Has issues due to saturation
3658     // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
3659 
3660     // Multiply
3661     int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3662                                  vget_low_s16(vreinterpretq_s16_m128i(b)));
3663     int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3664                                  vget_high_s16(vreinterpretq_s16_m128i(b)));
3665 
3666     // Rounding narrowing shift right
3667     // narrow = (int16_t)((mul + 16384) >> 15);
3668     int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
3669     int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
3670 
3671     // Join together
3672     return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
3673 }
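
// Illustrative sketch (not part of the translation layer above): per the
// rounding rule noted in the code above, each lane is
// (int16_t)(((int32_t) a * b + 0x4000) >> 15), which is what the rounding
// narrowing shift vrshrn_n_s32(..., 15) computes. Scalar view, with a made-up
// helper name and kept under "#if 0" so it is never compiled:
#if 0
static inline int16_t mulhrs_epi16_scalar_sketch(int16_t a, int16_t b)
{
    return (int16_t) (((int32_t) a * b + 0x4000) >> 15);
}
#endif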
3674 
3675 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
3676 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
3677 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
3678 // and pack the saturated results in dst.
3679 //
3680 //   FOR j := 0 to 7
3681 //      i := j*16
3682 //      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
3683 //      a[i+7:i]*b[i+7:i] )
3684 //   ENDFOR
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
3686 {
3687 #if defined(__aarch64__)
3688     uint8x16_t a = vreinterpretq_u8_m128i(_a);
3689     int8x16_t b = vreinterpretq_s8_m128i(_b);
3690     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
3691                              vmovl_s8(vget_low_s8(b)));
3692     int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
3693                              vmovl_s8(vget_high_s8(b)));
3694     return vreinterpretq_m128i_s16(
3695         vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
3696 #else
3697     // This would be much simpler if x86 would choose to zero extend OR sign
3698     // extend, not both. This could probably be optimized better.
3699     uint16x8_t a = vreinterpretq_u16_m128i(_a);
3700     int16x8_t b = vreinterpretq_s16_m128i(_b);
3701 
3702     // Zero extend a
3703     int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
3704     int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
3705 
3706     // Sign extend by shifting left then shifting right.
3707     int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
3708     int16x8_t b_odd = vshrq_n_s16(b, 8);
3709 
3710     // multiply
3711     int16x8_t prod1 = vmulq_s16(a_even, b_even);
3712     int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
3713 
3714     // saturated add
3715     return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
3716 #endif
3717 }
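
// Illustrative sketch (not part of the translation layer above): PMADDUBSW
// multiplies an unsigned byte from a by a signed byte from b, adds adjacent
// products, and saturates the 16-bit sum; the mixed zero/sign extension is why
// the ARMv7 fallback above is so involved. Scalar view of one output lane,
// with a made-up helper name and kept under "#if 0" so it is never compiled:
#if 0
static inline int16_t maddubs_epi16_scalar_sketch(uint8_t a0, int8_t b0,
                                                  uint8_t a1, int8_t b1)
{
    int32_t sum = (int32_t) a0 * b0 + (int32_t) a1 * b1;
    if (sum > 32767)
        sum = 32767; /* saturate to INT16_MAX */
    if (sum < -32768)
        sum = -32768; /* saturate to INT16_MIN */
    return (int16_t) sum;
}
#endif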
3718 
// Computes the fused multiply-add product of 32-bit floating point numbers.
3720 //
3721 // Return Value
3722 // Multiplies A and B, and adds C to the temporary result before returning it.
3723 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
3725 {
3726 #if defined(__aarch64__)
3727     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
3728                                             vreinterpretq_f32_m128(b),
3729                                             vreinterpretq_f32_m128(a)));
3730 #else
3731     return _mm_add_ps(_mm_mul_ps(a, b), c);
3732 #endif
3733 }
3734 
3735 // Alternatively add and subtract packed single-precision (32-bit)
3736 // floating-point elements in a to/from packed elements in b, and store the
3737 // results in dst.
3738 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
3740 {
3741     __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
3742     return _mm_fmadd_ps(b, mask, a);
3743 }
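
// Illustrative example (assumes the _mm_setr_ps helper defined elsewhere in
// this header): even-indexed lanes are subtracted, odd-indexed lanes are added.
//
//   __m128 a = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
//   __m128 b = _mm_setr_ps( 1.0f,  2.0f,  3.0f,  4.0f);
//   __m128 r = _mm_addsub_ps(a, b);  // {10-1, 20+2, 30-3, 40+4} = {9, 22, 27, 44}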
3744 
3745 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
3746 // elements in a and b, and pack the results in dst.
3747 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
3748 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
3749 {
3750 #if defined(__aarch64__)
3751     return vreinterpretq_m128d_f64(
3752         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3753 #else
3754     double *da = (double *) &a;
3755     double *db = (double *) &b;
3756     double c[] = {da[0] + da[1], db[0] + db[1]};
3757     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
3758 #endif
3759 }
3760 
3761 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3762 // b, then horizontally sum each consecutive 8 differences to produce two
3763 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3764 // 16 bits of 64-bit elements in dst.
3765 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
3766 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
3767 {
3768     uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
3769     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3770     uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3771     uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3772     return (__m128i) vsetq_lane_u16(r4, r, 4);
3773 }
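
// Illustrative example, using plain NEON initializers and the
// vreinterpretq_m128i_u8 helper this header defines: with every byte of a equal
// to 5 and every byte of b equal to 2, each group of eight absolute differences
// sums to 8 * |5 - 2| = 24.
//
//   __m128i x = vreinterpretq_m128i_u8(vdupq_n_u8(5));
//   __m128i y = vreinterpretq_m128i_u8(vdupq_n_u8(2));
//   __m128i r = _mm_sad_epu8(x, y);  // low 16 bits of each 64-bit half hold 24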
3774 
3775 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3776 // b, then horizontally sum each consecutive 8 differences to produce four
3777 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3778 // 16 bits of dst.
3779 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
3780 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
3781 {
3782     uint16x4_t t =
3783         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3784     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3785     return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
3786 }
3787 
3788 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3789 // b, then horizontally sum each consecutive 8 differences to produce four
3790 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3791 // 16 bits of dst.
3792 //
3793 //   FOR j := 0 to 7
3794 //      i := j*8
3795 //      tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
3796 //   ENDFOR
3797 //   dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
3798 //                tmp[47:40] + tmp[55:48] + tmp[63:56]
//   dst[63:16] := 0
3799 //
3800 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
3801 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
3802 
3803 // Divides the four single-precision, floating-point values of a by those of b.
3804 //
3805 //   r0 := a0 / b0
3806 //   r1 := a1 / b1
3807 //   r2 := a2 / b2
3808 //   r3 := a3 / b3
3809 //
3810 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
3811 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
3812 {
3813 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
3814     return vreinterpretq_m128_f32(
3815         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3816 #else
3817     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
3818     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
3819 #if SSE2NEON_PRECISE_DIV
3820     // Additional Newton-Raphson iteration for accuracy
3821     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
3822 #endif
3823     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
3824 #endif
3825 }
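
// Illustrative example (assumes the _mm_setr_ps/_mm_set1_ps helpers defined
// elsewhere in this header):
//
//   __m128 n = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 d = _mm_set1_ps(2.0f);
//   __m128 q = _mm_div_ps(n, d);  // approximately {0.5, 1.0, 1.5, 2.0}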
3826 
3827 // Divides the scalar single-precision floating point value of a by b.
3828 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
3829 FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
3830 {
3831     float32_t value =
3832         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
3833     return vreinterpretq_m128_f32(
3834         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3835 }
3836 
3837 // Divide packed double-precision (64-bit) floating-point elements in a by
3838 // packed elements in b, and store the results in dst.
3839 //
3840 //  FOR j := 0 to 1
3841 //    i := 64*j
3842 //    dst[i+63:i] := a[i+63:i] / b[i+63:i]
3843 //  ENDFOR
3844 //
3845 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
3846 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
3847 {
3848 #if defined(__aarch64__)
3849     return vreinterpretq_m128d_f64(
3850         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3851 #else
3852     double *da = (double *) &a;
3853     double *db = (double *) &b;
3854     double c[2];
3855     c[0] = da[0] / db[0];
3856     c[1] = da[1] / db[1];
3857     return vld1q_f32((float32_t *) c);
3858 #endif
3859 }
3860 
3861 // Divide the lower double-precision (64-bit) floating-point element in a by the
3862 // lower double-precision (64-bit) floating-point element in b, store the result
3863 // in the lower element of dst, and copy the upper element from a to the upper
3864 // element of dst.
3865 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
3866 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
3867 {
3868 #if defined(__aarch64__)
3869     float64x2_t tmp =
3870         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
3871     return vreinterpretq_m128d_f64(
3872         vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
3873 #else
3874     return _mm_move_sd(a, _mm_div_pd(a, b));
3875 #endif
3876 }
3877 
3878 // Compute the approximate reciprocal of packed single-precision (32-bit)
3879 // floating-point elements in a, and store the results in dst. The maximum
3880 // relative error for this approximation is less than 1.5*2^-12.
3881 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
3882 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
3883 {
3884     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
3885     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3886 #if SSE2NEON_PRECISE_DIV
3887     // Additional Newton-Raphson iteration for accuracy
3888     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3889 #endif
3890     return vreinterpretq_m128_f32(recip);
3891 }
3892 
3893 // Compute the approximate reciprocal of the lower single-precision (32-bit)
3894 // floating-point element in a, store the result in the lower element of dst,
3895 // and copy the upper 3 packed elements from a to the upper elements of dst. The
3896 // maximum relative error for this approximation is less than 1.5*2^-12.
3897 //
3898 //   dst[31:0] := (1.0 / a[31:0])
3899 //   dst[127:32] := a[127:32]
3900 //
3901 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
3902 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
3903 {
3904     return _mm_move_ss(a, _mm_rcp_ps(a));
3905 }
3906 
3907 // Computes the approximations of square roots of the four single-precision,
3908 // floating-point values of a. First computes reciprocal square roots and then
3909 // reciprocals of the four values.
3910 //
3911 //   r0 := sqrt(a0)
3912 //   r1 := sqrt(a1)
3913 //   r2 := sqrt(a2)
3914 //   r3 := sqrt(a3)
3915 //
3916 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
3917 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
3918 {
3919 #if SSE2NEON_PRECISE_SQRT
3920     float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3921 
3922     // Test for vrsqrteq_f32(0) -> positive infinity case.
3923     // Change to zero, so that s * 1/sqrt(s) result is zero too.
3924     const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
3925     const uint32x4_t div_by_zero =
3926         vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
3927     recip = vreinterpretq_f32_u32(
3928         vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
3929 
3930     // Additional Newton-Raphson iteration for accuracy
3931     recip = vmulq_f32(
3932         vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
3933         recip);
3934     recip = vmulq_f32(
3935         vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
3936         recip);
3937 
3938     // sqrt(s) = s * 1/sqrt(s)
3939     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
3940 #elif defined(__aarch64__)
3941     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
3942 #else
3943     float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3944     float32x4_t sq = vrecpeq_f32(recipsq);
3945     return vreinterpretq_m128_f32(sq);
3946 #endif
3947 }
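
// Illustrative example (assumes the _mm_setr_ps helper defined elsewhere in
// this header):
//
//   __m128 v = _mm_setr_ps(1.0f, 4.0f, 9.0f, 16.0f);
//   __m128 r = _mm_sqrt_ps(v);  // approximately {1, 2, 3, 4}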
3948 
3949 // Computes the approximation of the square root of the scalar single-precision
3950 // floating point value of in.
3951 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
3952 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
3953 {
3954     float32_t value =
3955         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
3956     return vreinterpretq_m128_f32(
3957         vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
3958 }
3959 
3960 // Computes the approximations of the reciprocal square roots of the four
3961 // single-precision floating point values of in.
3962 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
3963 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
3964 {
3965     float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3966 #if SSE2NEON_PRECISE_RSQRT
3967     // Additional Newton-Raphson iteration for accuracy
3968     out = vmulq_f32(
3969         out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
3970     out = vmulq_f32(
3971         out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
3972 #endif
3973     return vreinterpretq_m128_f32(out);
3974 }
3975 
3976 // Compute the approximate reciprocal square root of the lower single-precision
3977 // (32-bit) floating-point element in a, store the result in the lower element
3978 // of dst, and copy the upper 3 packed elements from a to the upper elements of
3979 // dst.
3980 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
3981 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
3982 {
3983     return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
3984 }
3985 
3986 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3987 // values in dst.
3988 //
3989 //   FOR j := 0 to 3
3990 //      i := j*16
3991 //      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3992 //   ENDFOR
3993 //
3994 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3995 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
3996 {
3997     return vreinterpret_m64_s16(
3998         vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3999 }
4000 
4001 // Compare packed signed 16-bit integers in a and b, and store packed maximum
4002 // values in dst.
4003 //
4004 //   FOR j := 0 to 3
4005 //      i := j*16
4006 //      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
4007 //   ENDFOR
4008 //
4009 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
4010 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
4011 
4012 // Computes the maximums of the four single-precision, floating-point values of
4013 // a and b.
4014 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
4015 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
4016 {
4017 #if SSE2NEON_PRECISE_MINMAX
4018     float32x4_t _a = vreinterpretq_f32_m128(a);
4019     float32x4_t _b = vreinterpretq_f32_m128(b);
4020     return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
4021 #else
4022     return vreinterpretq_m128_f32(
4023         vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4024 #endif
4025 }
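
// Illustrative note on the SSE2NEON_PRECISE_MINMAX path above: x86 MAXPS
// returns its second operand when a comparison involves NaN, which the
// blend-on-compare form reproduces, whereas plain vmaxq_f32 may return NaN.
// Assumes the _mm_set1_ps helper defined elsewhere in this header and the NAN
// macro from <math.h>.
//
//   __m128 a = _mm_set1_ps(NAN);
//   __m128 b = _mm_set1_ps(1.0f);
//   _mm_max_ps(a, b);  // 1.0f per lane with SSE2NEON_PRECISE_MINMAX enabled;
//                      // the NaN may propagate through vmaxq_f32 otherwise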
4026 
4027 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
4028 // values in dst.
4029 //
4030 //   FOR j := 0 to 7
4031 //      i := j*8
4032 //      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
4033 //   ENDFOR
4034 //
4035 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
4036 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
4037 {
4038     return vreinterpret_m64_u8(
4039         vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
4040 }
4041 
4042 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
4043 // values in dst.
4044 //
4045 //   FOR j := 0 to 7
4046 //      i := j*8
4047 //      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
4048 //   ENDFOR
4049 //
4050 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
4051 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
4052 
4053 // Compare packed signed 16-bit integers in a and b, and store packed minimum
4054 // values in dst.
4055 //
4056 //   FOR j := 0 to 3
4057 //      i := j*16
4058 //      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
4059 //   ENDFOR
4060 //
4061 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
4062 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
4063 {
4064     return vreinterpret_m64_s16(
4065         vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
4066 }
4067 
4068 // Compare packed signed 16-bit integers in a and b, and store packed minimum
4069 // values in dst.
4070 //
4071 //   FOR j := 0 to 3
4072 //      i := j*16
4073 //      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
4074 //   ENDFOR
4075 //
4076 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
4077 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
4078 
4079 // Computes the minima of the four single-precision, floating-point values of a
4080 // and b.
4081 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
4082 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
4083 {
4084 #if SSE2NEON_PRECISE_MINMAX
4085     float32x4_t _a = vreinterpretq_f32_m128(a);
4086     float32x4_t _b = vreinterpretq_f32_m128(b);
4087     return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
4088 #else
4089     return vreinterpretq_m128_f32(
4090         vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4091 #endif
4092 }
4093 
4094 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
4095 // values in dst.
4096 //
4097 //   FOR j := 0 to 7
4098 //      i := j*8
4099 //      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
4100 //   ENDFOR
4101 //
4102 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
4103 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
4104 {
4105     return vreinterpret_m64_u8(
4106         vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
4107 }
4108 
4109 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
4110 // values in dst.
4111 //
4112 //   FOR j := 0 to 7
4113 //      i := j*8
4114 //      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
4115 //   ENDFOR
4116 //
4117 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
4118 #define _m_pminub(a, b) _mm_min_pu8(a, b)
4119 
4120 // Computes the maximum of the two lower scalar single-precision floating point
4121 // values of a and b.
4122 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
4123 FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
4124 {
4125     float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
4126     return vreinterpretq_m128_f32(
4127         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
4128 }
4129 
4130 // Computes the minimum of the two lower scalar single-precision floating point
4131 // values of a and b.
4132 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
4133 FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
4134 {
4135     float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
4136     return vreinterpretq_m128_f32(
4137         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
4138 }
4139 
4140 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4141 // 16 unsigned 8-bit integers from b.
4142 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
4143 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4144 {
4145     return vreinterpretq_m128i_u8(
4146         vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4147 }
4148 
4149 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4150 // 16 unsigned 8-bit integers from b.
4151 // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
4152 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4153 {
4154     return vreinterpretq_m128i_u8(
4155         vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4156 }
4157 
4158 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4159 // signed 16-bit integers from b.
4160 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
4161 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4162 {
4163     return vreinterpretq_m128i_s16(
4164         vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4165 }
4166 
4167 // Compare packed signed 8-bit integers in a and b, and store packed maximum
4168 // values in dst.
4169 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
4170 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
4171 {
4172     return vreinterpretq_m128i_s8(
4173         vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4174 }
4175 
4176 // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
4177 // values in dst.
4178 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
4179 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
4180 {
4181     return vreinterpretq_m128i_u16(
4182         vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
4183 }
4184 
4185 // Compare packed signed 8-bit integers in a and b, and store packed minimum
4186 // values in dst.
4187 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
4188 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
4189 {
4190     return vreinterpretq_m128i_s8(
4191         vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4192 }
4193 
4194 // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
4195 // values in dst.
4196 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
4197 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
4198 {
4199     return vreinterpretq_m128i_u16(
4200         vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
4201 }
4202 
4203 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4204 // signed 16-bit integers from b.
4205 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
4206 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4207 {
4208     return vreinterpretq_m128i_s16(
4209         vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4210 }
4211 
4212 // epi versions of min/max
4213 // Computes the pairwise maximums of the four signed 32-bit integer values of a
4214 // and b.
4215 //
4216 // A 128-bit parameter that can be defined with the following equations:
4217 //   r0 := (a0 > b0) ? a0 : b0
4218 //   r1 := (a1 > b1) ? a1 : b1
4219 //   r2 := (a2 > b2) ? a2 : b2
4220 //   r3 := (a3 > b3) ? a3 : b3
4221 //
4222 // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
4223 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
4224 {
4225     return vreinterpretq_m128i_s32(
4226         vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4227 }
4228 
4229 // Computes the pairwise minima of the four signed 32-bit integer values of a
4230 // and b.
4231 //
4232 // A 128-bit parameter that can be defined with the following equations:
4233 //   r0 := (a0 < b0) ? a0 : b0
4234 //   r1 := (a1 < b1) ? a1 : b1
4235 //   r2 := (a2 < b2) ? a2 : b2
4236 //   r3 := (a3 < b3) ? a3 : b3
4237 //
4238 // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
4239 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
4240 {
4241     return vreinterpretq_m128i_s32(
4242         vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4243 }
4244 
4245 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
4246 // values in dst.
4247 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
4248 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
4249 {
4250     return vreinterpretq_m128i_u32(
4251         vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
4252 }
4253 
4254 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
4255 // values in dst.
4256 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
4257 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
4258 {
4259     return vreinterpretq_m128i_u32(
4260         vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
4261 }
4262 
4263 // Multiply the packed unsigned 16-bit integers in a and b, producing
4264 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
4265 // integers in dst.
4266 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
4267 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
4268 {
4269     return vreinterpret_m64_u16(vshrn_n_u32(
4270         vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
4271 }
4272 
4273 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4274 // integers from b.
4275 //
4276 //   r0 := (a0 * b0)[31:16]
4277 //   r1 := (a1 * b1)[31:16]
4278 //   ...
4279 //   r7 := (a7 * b7)[31:16]
4280 //
4281 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
4282 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4283 {
4284     /* FIXME: issue with large values because of result saturation */
4285     // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4286     // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4287     // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4288     int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4289     int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4290     int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4291     int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4292     int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4293     int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4294     uint16x8x2_t r =
4295         vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4296     return vreinterpretq_m128i_u16(r.val[1]);
4297 }
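
// Illustrative example, using plain NEON initializers and the
// vreinterpretq_m128i_s16 helper this header defines:
//
//   __m128i a = vreinterpretq_m128i_s16(vdupq_n_s16(30000));
//   __m128i r = _mm_mulhi_epi16(a, a);
//   // 30000 * 30000 = 900000000 = 0x35A4E900, so every lane of r is 0x35A4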
4298 
4299 // Multiply the packed unsigned 16-bit integers in a and b, producing
4300 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
4301 // integers in dst.
4302 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
4303 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4304 {
4305     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4306     uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4307     uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4308 #if defined(__aarch64__)
4309     uint32x4_t ab7654 =
4310         vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4311     uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4312                               vreinterpretq_u16_u32(ab7654));
4313     return vreinterpretq_m128i_u16(r);
4314 #else
4315     uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4316     uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4317     uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4318     uint16x8x2_t r =
4319         vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4320     return vreinterpretq_m128i_u16(r.val[1]);
4321 #endif
4322 }
4323 
4324 // Computes the pairwise addition of the single-precision, floating-point
4325 // values in a and b.
4326 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
4327 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
4328 {
4329 #if defined(__aarch64__)
4330     return vreinterpretq_m128_f32(
4331         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4332 #else
4333     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
4334     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
4335     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
4336     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
4337     return vreinterpretq_m128_f32(
4338         vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
4339 #endif
4340 }
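
// Illustrative example (assumes the _mm_setr_ps helper defined elsewhere in
// this header): adjacent pairs of a fill the low half of the result, adjacent
// pairs of b fill the high half.
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   __m128 r = _mm_hadd_ps(a, b);  // {1+2, 3+4, 5+6, 7+8} = {3, 7, 11, 15}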
4341 
4342 // Computes the pairwise addition of the 16-bit signed or unsigned integer
4343 // values in a and b.
4344 FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
4345 {
4346     int16x8_t a = vreinterpretq_s16_m128i(_a);
4347     int16x8_t b = vreinterpretq_s16_m128i(_b);
4348 #if defined(__aarch64__)
4349     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
4350 #else
4351     return vreinterpretq_m128i_s16(
4352         vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
4353                      vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
4354 #endif
4355 }
4356 
4357 // Horizontally subtract adjacent pairs of single-precision (32-bit)
4358 // floating-point elements in a and b, and pack the results in dst.
4359 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
4360 FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
4361 {
4362 #if defined(__aarch64__)
4363     return vreinterpretq_m128_f32(vsubq_f32(
4364         vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
4365         vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
4366 #else
4367     float32x4x2_t c =
4368         vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
4369     return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
4370 #endif
4371 }
4372 
4373 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
4374 // signed 16-bit results in dst.
4375 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
4376 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
4377 {
4378     return vreinterpret_m64_s16(
4379         vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
4380 }
4381 
4382 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
4383 // signed 32-bit results in dst.
4384 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
4385 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
4386 {
4387     return vreinterpret_m64_s32(
4388         vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
4389 }
4390 
4391 // Computes the pairwise difference of the 16-bit signed or unsigned integer
4392 // values in a and b.
4393 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
4394 {
4395     int32x4_t a = vreinterpretq_s32_m128i(_a);
4396     int32x4_t b = vreinterpretq_s32_m128i(_b);
4397     // Interleave using vshrn/vmovn
4398     // [a0|a2|a4|a6|b0|b2|b4|b6]
4399     // [a1|a3|a5|a7|b1|b3|b5|b7]
4400     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
4401     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
4402     // Subtract
4403     return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
4404 }
4405 
4406 // Computes the saturated pairwise addition of the 16-bit signed integer
4407 // values in a and b.
4408 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
4409 {
4410 #if defined(__aarch64__)
4411     int16x8_t a = vreinterpretq_s16_m128i(_a);
4412     int16x8_t b = vreinterpretq_s16_m128i(_b);
4413     return vreinterpretq_s64_s16(
4414         vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
4415 #else
4416     int32x4_t a = vreinterpretq_s32_m128i(_a);
4417     int32x4_t b = vreinterpretq_s32_m128i(_b);
4418     // Interleave using vshrn/vmovn
4419     // [a0|a2|a4|a6|b0|b2|b4|b6]
4420     // [a1|a3|a5|a7|b1|b3|b5|b7]
4421     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
4422     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
4423     // Saturated add
4424     return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
4425 #endif
4426 }
4427 
4428 // Computes the saturated pairwise difference of the 16-bit signed integer
4429 // values in a and b.
4430 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
4431 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
4432 {
4433 #if defined(__aarch64__)
4434     int16x8_t a = vreinterpretq_s16_m128i(_a);
4435     int16x8_t b = vreinterpretq_s16_m128i(_b);
4436     return vreinterpretq_s64_s16(
4437         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
4438 #else
4439     int32x4_t a = vreinterpretq_s32_m128i(_a);
4440     int32x4_t b = vreinterpretq_s32_m128i(_b);
4441     // Interleave using vshrn/vmovn
4442     // [a0|a2|a4|a6|b0|b2|b4|b6]
4443     // [a1|a3|a5|a7|b1|b3|b5|b7]
4444     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
4445     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
4446     // Saturated subtract
4447     return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
4448 #endif
4449 }
4450 
4451 // Computes the pairwise addition of the 32-bit signed or unsigned integer
4452 // values in a and b.
4453 FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
4454 {
4455     int32x4_t a = vreinterpretq_s32_m128i(_a);
4456     int32x4_t b = vreinterpretq_s32_m128i(_b);
4457     return vreinterpretq_m128i_s32(
4458         vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
4459                      vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
4460 }
4461 
4462 // Computes the pairwise difference of the 32-bit signed or unsigned integer
4463 // values in a and b.
4464 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
4465 {
4466     int64x2_t a = vreinterpretq_s64_m128i(_a);
4467     int64x2_t b = vreinterpretq_s64_m128i(_b);
4468     // Interleave using vshrn/vmovn
4469     // [a0|a2|b0|b2]
4470     // [a1|a3|b1|b3]
4471     int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
4472     int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
4473     // Subtract
4474     return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
4475 }
4476 
4477 // Kahan summation for accurate summation of floating-point numbers.
4478 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
4479 FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
4480 {
4481     y -= *c;
4482     float t = *sum + y;
4483     *c = (t - *sum) - y;
4484     *sum = t;
4485 }
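
// Illustrative usage: accumulating a caller-supplied float array `vals` of
// length `n` (both hypothetical) while carrying the compensation term between
// steps.
//
//   float sum = 0.0f, comp = 0.0f;
//   for (int i = 0; i < n; i++)
//       _sse2neon_kadd_f32(&sum, &comp, vals[i]);
//   // sum now holds the compensated total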
4486 
4487 // Conditionally multiply the packed single-precision (32-bit) floating-point
4488 // elements in a and b using the high 4 bits in imm8, sum the four products,
4489 // and conditionally store the sum in dst using the low 4 bits of imm.
4490 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
4491 FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
4492 {
4493 #if defined(__aarch64__)
4494     /* shortcuts */
4495     if (imm == 0xFF) {
4496         return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
4497     }
4498     if (imm == 0x7F) {
4499         float32x4_t m = _mm_mul_ps(a, b);
4500         m[3] = 0;
4501         return _mm_set1_ps(vaddvq_f32(m));
4502     }
4503 #endif
4504 
4505     float s = 0, c = 0;
4506     float32x4_t f32a = vreinterpretq_f32_m128(a);
4507     float32x4_t f32b = vreinterpretq_f32_m128(b);
4508 
4509     /* To improve the accuracy of floating-point summation, Kahan algorithm
4510      * is used for each operation.
4511      */
4512     if (imm & (1 << 4))
4513         _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
4514     if (imm & (1 << 5))
4515         _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
4516     if (imm & (1 << 6))
4517         _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
4518     if (imm & (1 << 7))
4519         _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
4520     s += c;
4521 
4522     float32x4_t res = {
4523         (imm & 0x1) ? s : 0,
4524         (imm & 0x2) ? s : 0,
4525         (imm & 0x4) ? s : 0,
4526         (imm & 0x8) ? s : 0,
4527     };
4528     return vreinterpretq_m128_f32(res);
4529 }
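
// Illustrative example (assumes the _mm_setr_ps helper defined elsewhere in
// this header): the high nibble of imm selects which products participate and
// the low nibble selects which result lanes receive the sum.
//
//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 w = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   __m128 d = _mm_dp_ps(v, w, 0xFF);  // 1*5 + 2*6 + 3*7 + 4*8 = 70 in all lanes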
4530 
4531 /* Compare operations */
4532 
4533 // Compares for less than
4534 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
4535 FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
4536 {
4537     return vreinterpretq_m128_u32(
4538         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4539 }
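
// Illustrative example (assumes the _mm_setr_ps helper defined elsewhere in
// this header): each lane of the result is an all-ones or all-zeros bit mask.
//
//   __m128 a = _mm_setr_ps(1.0f, 5.0f, 3.0f,  0.0f);
//   __m128 b = _mm_setr_ps(2.0f, 4.0f, 3.0f, -1.0f);
//   __m128 m = _mm_cmplt_ps(a, b);  // {0xFFFFFFFF, 0, 0, 0} viewed as integers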
4540 
4541 // Compares for less than
4542 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
4543 FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
4544 {
4545     return _mm_move_ss(a, _mm_cmplt_ps(a, b));
4546 }
4547 
4548 // Compares for greater than.
4549 //
4550 //   r0 := (a0 > b0) ? 0xffffffff : 0x0
4551 //   r1 := (a1 > b1) ? 0xffffffff : 0x0
4552 //   r2 := (a2 > b2) ? 0xffffffff : 0x0
4553 //   r3 := (a3 > b3) ? 0xffffffff : 0x0
4554 //
4555 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
4556 FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
4557 {
4558     return vreinterpretq_m128_u32(
4559         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4560 }
4561 
4562 // Compares for greater than.
4563 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
4564 FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
4565 {
4566     return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
4567 }
4568 
4569 // Compares for greater than or equal.
4570 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
4571 FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
4572 {
4573     return vreinterpretq_m128_u32(
4574         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4575 }
4576 
4577 // Compares for greater than or equal.
4578 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
4579 FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
4580 {
4581     return _mm_move_ss(a, _mm_cmpge_ps(a, b));
4582 }
4583 
4584 // Compares for less than or equal.
4585 //
4586 //   r0 := (a0 <= b0) ? 0xffffffff : 0x0
4587 //   r1 := (a1 <= b1) ? 0xffffffff : 0x0
4588 //   r2 := (a2 <= b2) ? 0xffffffff : 0x0
4589 //   r3 := (a3 <= b3) ? 0xffffffff : 0x0
4590 //
4591 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
4592 FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
4593 {
4594     return vreinterpretq_m128_u32(
4595         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4596 }
4597 
4598 // Compares for less than or equal.
4599 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
4600 FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
4601 {
4602     return _mm_move_ss(a, _mm_cmple_ps(a, b));
4603 }
4604 
4605 // Compares for equality.
4606 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
4607 FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
4608 {
4609     return vreinterpretq_m128_u32(
4610         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4611 }
4612 
4613 // Compares for equality.
4614 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
4615 FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
4616 {
4617     return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
4618 }
4619 
4620 // Compares for inequality.
4621 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
4622 FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
4623 {
4624     return vreinterpretq_m128_u32(vmvnq_u32(
4625         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
4626 }
4627 
4628 // Compares for inequality.
4629 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
4630 FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
4631 {
4632     return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
4633 }
4634 
4635 // Compares for not greater than or equal.
4636 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
4637 FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
4638 {
4639     return _mm_cmplt_ps(a, b);
4640 }
4641 
4642 // Compares for not greater than or equal.
4643 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
4644 FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
4645 {
4646     return _mm_cmplt_ss(a, b);
4647 }
4648 
4649 // Compares for not greater than.
4650 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
4651 FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
4652 {
4653     return _mm_cmple_ps(a, b);
4654 }
4655 
4656 // Compares for not greater than.
4657 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
4658 FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
4659 {
4660     return _mm_cmple_ss(a, b);
4661 }
4662 
4663 // Compares for not less than or equal.
4664 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
4665 FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
4666 {
4667     return _mm_cmpgt_ps(a, b);
4668 }
4669 
4670 // Compares for not less than or equal.
4671 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
4672 FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
4673 {
4674     return _mm_cmpgt_ss(a, b);
4675 }
4676 
4677 // Compares for not less than.
4678 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
4679 FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
4680 {
4681     return _mm_cmpge_ps(a, b);
4682 }
4683 
4684 // Compares for not less than.
4685 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
4686 FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
4687 {
4688     return _mm_cmpge_ss(a, b);
4689 }
4690 
4691 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
4692 // unsigned 8-bit integers in b for equality.
4693 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
4694 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
4695 {
4696     return vreinterpretq_m128i_u8(
4697         vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4698 }
4699 
4700 // Compare packed double-precision (64-bit) floating-point elements in a and b
4701 // for equality, and store the results in dst.
4702 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
4703 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
4704 {
4705 #if defined(__aarch64__)
4706     return vreinterpretq_m128d_u64(
4707         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4708 #else
4709     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4710     uint32x4_t cmp =
4711         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
4712     uint32x4_t swapped = vrev64q_u32(cmp);
4713     return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
4714 #endif
4715 }
4716 
4717 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
4718 // unsigned 16-bit integers in b for equality.
4719 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
4720 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
4721 {
4722     return vreinterpretq_m128i_u16(
4723         vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4724 }
4725 
4726 // Compare packed 32-bit integers in a and b for equality, and store the results
4727 // in dst
4728 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
4729 {
4730     return vreinterpretq_m128i_u32(
4731         vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4732 }
4733 
4734 // Compare packed 64-bit integers in a and b for equality, and store the results
4735 // in dst
4736 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
4737 {
4738 #if defined(__aarch64__)
4739     return vreinterpretq_m128i_u64(
4740         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
4741 #else
4742     // ARMv7 lacks vceqq_u64
4743     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4744     uint32x4_t cmp =
4745         vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
4746     uint32x4_t swapped = vrev64q_u32(cmp);
4747     return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
4748 #endif
4749 }
4750 
4751 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4752 // in b for less than.
4753 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
4754 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
4755 {
4756     return vreinterpretq_m128i_u8(
4757         vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4758 }
4759 
4760 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4761 // in b for greater than.
4762 //
4763 //   r0 := (a0 > b0) ? 0xff : 0x0
4764 //   r1 := (a1 > b1) ? 0xff : 0x0
4765 //   ...
4766 //   r15 := (a15 > b15) ? 0xff : 0x0
4767 //
4768 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
4769 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
4770 {
4771     return vreinterpretq_m128i_u8(
4772         vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4773 }
4774 
4775 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4776 // in b for less than.
4777 //
4778 //   r0 := (a0 < b0) ? 0xffff : 0x0
4779 //   r1 := (a1 < b1) ? 0xffff : 0x0
4780 //   ...
4781 //   r7 := (a7 < b7) ? 0xffff : 0x0
4782 //
4783 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
4784 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
4785 {
4786     return vreinterpretq_m128i_u16(
4787         vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4788 }
4789 
4790 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4791 // in b for greater than.
4792 //
4793 //   r0 := (a0 > b0) ? 0xffff : 0x0
4794 //   r1 := (a1 > b1) ? 0xffff : 0x0
4795 //   ...
4796 //   r7 := (a7 > b7) ? 0xffff : 0x0
4797 //
4798 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
4799 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
4800 {
4801     return vreinterpretq_m128i_u16(
4802         vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4803 }
4804 
4805 
4806 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4807 // in b for less than.
4808 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
4809 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
4810 {
4811     return vreinterpretq_m128i_u32(
4812         vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4813 }
4814 
4815 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4816 // in b for greater than.
4817 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
4818 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
4819 {
4820     return vreinterpretq_m128i_u32(
4821         vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4822 }
4823 
4824 // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
4825 // in b for greater than.
4826 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
4827 {
4828 #if defined(__aarch64__)
4829     return vreinterpretq_m128i_u64(
4830         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
4831 #else
4832     // ARMv7 lacks vcgtq_s64.
4833     // This is based off of Clang's SSE2 polyfill:
4834     // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
4835 
4836     // Mask the sign bit out since we need a signed AND an unsigned comparison
4837     // and it is ugly to try and split them.
4838     int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
4839     int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
4840     int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
4841     // Check if a > b
4842     int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
4843     // Copy upper mask to lower mask
4844     // a_hi > b_hi
4845     int64x2_t gt_hi = vshrq_n_s64(greater, 63);
4846     // Copy lower mask to upper mask
4847     // a_lo > b_lo
4848     int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
4849     // Compare for equality
4850     int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
4851     // Copy upper mask to lower mask
4852     // a_hi == b_hi
4853     int64x2_t eq_hi = vshrq_n_s64(equal, 63);
4854     // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
4855     int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
4856     return vreinterpretq_m128i_s64(ret);
4857 #endif
4858 }
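
// Worked example of the ARMv7 decomposition above (illustrative):
//   a = 0x00000001FFFFFFFF, b = 0x0000000200000000
//   a_hi > b_hi is false and a_hi == b_hi is false, so a > b is false, even
//   though the unsigned low words satisfy a_lo > b_lo.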
4859 
4860 // Compares the four 32-bit floats in a and b to check if any values are NaN.
4861 // Ordered compare between each value returns true for "orderable" and false for
4862 // "not orderable" (NaN).
4863 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
4864 // also:
4865 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
4866 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
4867 FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
4868 {
4869     // Note: NEON does not have ordered compare builtin
4870     // Need to compare a eq a and b eq b to check for NaN
4871     // Do AND of results to get final
4872     uint32x4_t ceqaa =
4873         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4874     uint32x4_t ceqbb =
4875         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4876     return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
4877 }
4878 
4879 // Compares for ordered.
4880 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
4881 FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
4882 {
4883     return _mm_move_ss(a, _mm_cmpord_ps(a, b));
4884 }
4885 
4886 // Compares for unordered.
4887 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
4888 FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
4889 {
4890     uint32x4_t f32a =
4891         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4892     uint32x4_t f32b =
4893         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4894     return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
4895 }
4896 
4897 // Compares for unordered.
4898 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
4899 FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
4900 {
4901     return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
4902 }
4903 
4904 // Compares the lower single-precision floating point scalar values of a and b
4905 // using a less than operation:
4906 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
4907 // Important note: the MSDN documentation is incorrect. If either of the values
4908 // is a NaN, the docs say you will get a one, but this in fact returns a zero.
4909 FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
4910 {
4911     uint32x4_t a_not_nan =
4912         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4913     uint32x4_t b_not_nan =
4914         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4915     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4916     uint32x4_t a_lt_b =
4917         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4918     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
4919 }
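
// Illustrative example (assumes the _mm_set_ss helper defined elsewhere in this
// header and the NAN macro from <math.h>):
//
//   int r1 = _mm_comilt_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f));  // 1
//   int r2 = _mm_comilt_ss(_mm_set_ss(1.0f), _mm_set_ss(NAN));   // 0, per the
//                                                                // note above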
4920 
4921 // Compares the lower single-precision floating point scalar values of a and b
4922 // using a greater than operation:
4923 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
4924 FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
4925 {
4926     // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
4927     // vreinterpretq_f32_m128(b)), 0);
4928     uint32x4_t a_not_nan =
4929         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4930     uint32x4_t b_not_nan =
4931         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4932     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4933     uint32x4_t a_gt_b =
4934         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4935     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
4936 }
4937 
4938 // Compares the lower single-precision floating point scalar values of a and b
4939 // using a less than or equal operation:
4940 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
4941 FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
4942 {
4943     // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
4944     // vreinterpretq_f32_m128(b)), 0);
4945     uint32x4_t a_not_nan =
4946         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4947     uint32x4_t b_not_nan =
4948         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4949     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4950     uint32x4_t a_le_b =
4951         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4952     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
4953 }
4954 
4955 // Compares the lower single-precision floating point scalar values of a and b
4956 // using a greater than or equal operation:
4957 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
4958 FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
4959 {
4960     // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
4961     // vreinterpretq_f32_m128(b)), 0);
4962     uint32x4_t a_not_nan =
4963         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4964     uint32x4_t b_not_nan =
4965         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4966     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4967     uint32x4_t a_ge_b =
4968         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4969     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
4970 }
4971 
4972 // Compares the lower single-precision floating point scalar values of a and b
4973 // using an equality operation:
4974 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
4975 FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
4976 {
4977     // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4978     // vreinterpretq_f32_m128(b)), 0);
4979     uint32x4_t a_not_nan =
4980         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4981     uint32x4_t b_not_nan =
4982         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4983     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4984     uint32x4_t a_eq_b =
4985         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4986     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
4987 }
4988 
4989 // Compares the lower single-precision floating point scalar values of a and b
4990 // using an inequality operation:
4991 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
4992 FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
4993 {
4994     // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4995     // vreinterpretq_f32_m128(b)), 0);
4996     uint32x4_t a_not_nan =
4997         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4998     uint32x4_t b_not_nan =
4999         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
5000     uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
5001     uint32x4_t a_neq_b = vmvnq_u32(
5002         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5003     return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
5004 }
5005 
5006 // according to the documentation, these intrinsics behave the same as the
5007 // non-'u' versions.  We'll just alias them here.
5008 #define _mm_ucomieq_ss _mm_comieq_ss
5009 #define _mm_ucomige_ss _mm_comige_ss
5010 #define _mm_ucomigt_ss _mm_comigt_ss
5011 #define _mm_ucomile_ss _mm_comile_ss
5012 #define _mm_ucomilt_ss _mm_comilt_ss
5013 #define _mm_ucomineq_ss _mm_comineq_ss
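
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): because the ordered comparisons above return 0 whenever either lower
// lane is NaN, _mm_comieq_ss(x, x) doubles as a "low lane is not NaN" test.
// The helper name sse2neon_example_lower_lane_is_nan is made up for this
// example.
FORCE_INLINE int sse2neon_example_lower_lane_is_nan(__m128 x)
{
    return !_mm_comieq_ss(x, x); /* 1 when the low lane of x is NaN */
}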
5014 
5015 /* Conversions */
5016 
5017 // Convert packed signed 32-bit integers in b to packed single-precision
5018 // (32-bit) floating-point elements, store the results in the lower 2 elements
5019 // of dst, and copy the upper 2 packed elements from a to the upper elements of
5020 // dst.
5021 //
5022 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
5023 //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
5024 //   dst[95:64] := a[95:64]
5025 //   dst[127:96] := a[127:96]
5026 //
5027 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
5028 FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
5029 {
5030     return vreinterpretq_m128_f32(
5031         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
5032                      vget_high_f32(vreinterpretq_f32_m128(a))));
5033 }
5034 
5035 // Convert the signed 32-bit integer b to a single-precision (32-bit)
5036 // floating-point element, store the result in the lower element of dst, and
5037 // copy the upper 3 packed elements from a to the upper elements of dst.
5038 //
5039 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
5040 //   dst[127:32] := a[127:32]
5041 //
5042 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
5043 FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
5044 {
5045     return vreinterpretq_m128_f32(
5046         vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
5047 }
5048 
5049 // Convert the signed 32-bit integer b to a single-precision (32-bit)
5050 // floating-point element, store the result in the lower element of dst, and
5051 // copy the upper 3 packed elements from a to the upper elements of dst.
5052 //
5053 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
5054 //   dst[127:32] := a[127:32]
5055 //
5056 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
5057 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
5058 
5059 // Convert the signed 64-bit integer b to a single-precision (32-bit)
5060 // floating-point element, store the result in the lower element of dst, and
5061 // copy the upper 3 packed elements from a to the upper elements of dst.
5062 //
5063 //   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
5064 //   dst[127:32] := a[127:32]
5065 //
5066 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
5067 FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
5068 {
5069     return vreinterpretq_m128_f32(
5070         vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
5071 }
5072 
5073 // Convert the lower single-precision (32-bit) floating-point element in a to a
5074 // 32-bit integer, and store the result in dst.
5075 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
5076 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
5077 {
5078 #if defined(__aarch64__)
5079     return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
5080 #else
5081     float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
5082     float32_t diff = data - floor(data);
5083     if (diff > 0.5)
5084         return (int32_t) ceil(data);
5085     if (unlikely(diff == 0.5)) {
5086         int32_t f = (int32_t) floor(data);
5087         int32_t c = (int32_t) ceil(data);
5088         return c & 1 ? f : c;
5089     }
5090     return (int32_t) floor(data);
5091 #endif
5092 }
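
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): both paths above round to nearest with ties to even, so 2.5f
// converts to 2 and 3.5f converts to 4. The helper name
// sse2neon_example_nearbyint is made up for this example; _mm_set_ps is
// defined earlier in this header.
FORCE_INLINE int sse2neon_example_nearbyint(float x)
{
    /* put x in the low lane; the upper lanes do not affect the result */
    return _mm_cvt_ss2si(_mm_set_ps(0, 0, 0, x));
}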
5093 
5094 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
5095 // floating-point elements, and store the results in dst.
5096 //
5097 //   FOR j := 0 to 3
5098 //      i := j*16
5099 //      m := j*32
5100 //      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
5101 //   ENDFOR
5102 //
5103 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
5104 FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
5105 {
5106     return vreinterpretq_m128_f32(
5107         vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
5108 }
5109 
5110 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
5111 // floating-point elements, store the results in the lower 2 elements of dst,
5112 // and copy the upper 2 packed elements from a to the upper elements of dst.
5113 //
5114 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
5115 //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
5116 //   dst[95:64] := a[95:64]
5117 //   dst[127:96] := a[127:96]
5118 //
5119 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
5120 FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
5121 {
5122     return vreinterpretq_m128_f32(
5123         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
5124                      vget_high_f32(vreinterpretq_f32_m128(a))));
5125 }
5126 
5127 // Convert packed signed 32-bit integers in a to packed single-precision
5128 // (32-bit) floating-point elements, store the results in the lower 2 elements
5129 // of dst, then convert the packed signed 32-bit integers in b to
5130 // single-precision (32-bit) floating-point elements, and store the results in
5131 // the upper 2 elements of dst.
5132 //
5133 //   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
5134 //   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
5135 //   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
5136 //   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
5137 //
5138 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
5139 FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
5140 {
5141     return vreinterpretq_m128_f32(vcvtq_f32_s32(
5142         vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
5143 }
5144 
5145 // Convert the lower packed 8-bit integers in a to packed single-precision
5146 // (32-bit) floating-point elements, and store the results in dst.
5147 //
5148 //   FOR j := 0 to 3
5149 //      i := j*8
5150 //      m := j*32
5151 //      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
5152 //   ENDFOR
5153 //
5154 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
5155 FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
5156 {
5157     return vreinterpretq_m128_f32(vcvtq_f32_s32(
5158         vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
5159 }
5160 
5161 // Convert packed unsigned 16-bit integers in a to packed single-precision
5162 // (32-bit) floating-point elements, and store the results in dst.
5163 //
5164 //   FOR j := 0 to 3
5165 //      i := j*16
5166 //      m := j*32
5167 //      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
5168 //   ENDFOR
5169 //
5170 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
5171 FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
5172 {
5173     return vreinterpretq_m128_f32(
5174         vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
5175 }
5176 
5177 // Convert the lower packed unsigned 8-bit integers in a to packed
5178 // single-precision (32-bit) floating-point elements, and store the results in
5179 // dst.
5180 //
5181 //   FOR j := 0 to 3
5182 //      i := j*8
5183 //      m := j*32
5184 //      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
5185 //   ENDFOR
5186 //
5187 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
5188 FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
5189 {
5190     return vreinterpretq_m128_f32(vcvtq_f32_u32(
5191         vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
5192 }
5193 
5194 // Converts the four single-precision, floating-point values of a to signed
5195 // 32-bit integer values using truncation.
5196 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
5197 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
5198 {
5199     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
5200 }
5201 
5202 // Convert the lower double-precision (64-bit) floating-point element in a to a
5203 // 64-bit integer with truncation, and store the result in dst.
5204 //
5205 //   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
5206 //
5207 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
5208 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
5209 {
5210 #if defined(__aarch64__)
5211     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
5212 #else
5213     double ret = *((double *) &a);
5214     return (int64_t) ret;
5215 #endif
5216 }
5217 
5218 // Convert the lower double-precision (64-bit) floating-point element in a to a
5219 // 64-bit integer with truncation, and store the result in dst.
5220 //
5221 //   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
5222 //
5223 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
5224 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
5225 
5226 // Converts the four signed 32-bit integer values of a to single-precision,
5227 // floating-point values
5228 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
5229 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
5230 {
5231     return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
5232 }
5233 
5234 // Converts the eight unsigned 8-bit integers in the lower 8 bytes to eight
5235 // unsigned 16-bit integers.
5236 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
5237 {
5238     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
5239     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
5240     return vreinterpretq_m128i_u16(u16x8);
5241 }
5242 
5243 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
5244 // unsigned 32-bit integers.
5245 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
5246 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
5247 {
5248     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
5249     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
5250     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
5251     return vreinterpretq_m128i_u32(u32x4);
5252 }
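
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): zero-extend the low four bytes of a vector to 32-bit lanes and then
// convert them to float, a common pixel-unpacking step built from the two
// helpers above. The helper name sse2neon_example_u8_to_f32 is made up for
// this example.
FORCE_INLINE __m128 sse2neon_example_u8_to_f32(__m128i a)
{
    return _mm_cvtepi32_ps(_mm_cvtepu8_epi32(a)); /* values stay in 0..255 */
}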
5253 
5254 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
5255 // unsigned 64-bit integers.
5256 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
5257 {
5258     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
5259     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
5260     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
5261     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
5262     return vreinterpretq_m128i_u64(u64x2);
5263 }
5264 
5265 // Converts the eight signed 8-bit integers in the lower 8 bytes to eight
5266 // signed 16-bit integers.
5267 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
5268 {
5269     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
5270     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
5271     return vreinterpretq_m128i_s16(s16x8);
5272 }
5273 
5274 // Converts the four signed 8-bit integers in the lower 32 bits to four
5275 // signed 32-bit integers.
5276 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
5277 {
5278     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
5279     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
5280     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
5281     return vreinterpretq_m128i_s32(s32x4);
5282 }
5283 
5284 // Converts the two signed 8-bit integers in the lower 16 bits to two
5285 // signed 64-bit integers.
5286 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
5287 {
5288     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
5289     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
5290     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
5291     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
5292     return vreinterpretq_m128i_s64(s64x2);
5293 }
5294 
5295 // Converts the four signed 16-bit integers in the lower 64 bits to four signed
5296 // 32-bit integers.
5297 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
5298 {
5299     return vreinterpretq_m128i_s32(
5300         vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
5301 }
5302 
5303 // Converts the two signed 16-bit integers in the lower 32 bits to two signed
5304 // 64-bit integers.
5305 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
5306 {
5307     int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
5308     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
5309     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
5310     return vreinterpretq_m128i_s64(s64x2);
5311 }
5312 
5313 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
5314 // unsigned 32-bit integers.
5315 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
5316 {
5317     return vreinterpretq_m128i_u32(
5318         vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
5319 }
5320 
5321 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
5322 // unsigned 64-bit integers.
5323 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
5324 {
5325     uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
5326     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
5327     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
5328     return vreinterpretq_m128i_u64(u64x2);
5329 }
5330 
5331 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
5332 // unsigned 64-bit integers.
5333 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
5334 {
5335     return vreinterpretq_m128i_u64(
5336         vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
5337 }
5338 
5339 // Converts the two signed 32-bit integers in the lower 64 bits to two signed
5340 // 64-bit integers.
5341 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
5342 {
5343     return vreinterpretq_m128i_s64(
5344         vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
5345 }
5346 
5347 // Converts the four single-precision, floating-point values of a to signed
5348 // 32-bit integer values.
5349 //
5350 //   r0 := (int) a0
5351 //   r1 := (int) a1
5352 //   r2 := (int) a2
5353 //   r3 := (int) a3
5354 //
5355 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
5356 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
5357 // does not support! It is supported on ARMv8-A however.
5358 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
5359 {
5360 #if defined(__aarch64__)
5361     return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
5362 #else
5363     uint32x4_t signmask = vdupq_n_u32(0x80000000);
5364     float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
5365                                  vdupq_n_f32(0.5f)); /* +/- 0.5 */
5366     int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
5367         vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
5368     int32x4_t r_trunc =
5369         vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
5370     int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
5371         vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
5372     int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
5373                                  vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
5374     float32x4_t delta = vsubq_f32(
5375         vreinterpretq_f32_m128(a),
5376         vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
5377     uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
5378     return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
5379 #endif
5380 }
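
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): contrast round-to-nearest-even with truncation, e.g. 3.5f becomes 4
// in *nearest but 3 in *trunc, and -1.9f becomes -2 and -1 respectively. The
// helper name sse2neon_example_round_vs_trunc is made up for this example.
FORCE_INLINE void sse2neon_example_round_vs_trunc(__m128 a,
                                                  __m128i *nearest,
                                                  __m128i *trunc)
{
    *nearest = _mm_cvtps_epi32(a); /* round to nearest, ties to even */
    *trunc = _mm_cvttps_epi32(a);  /* truncate toward zero */
}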
5381 
5382 // Convert packed single-precision (32-bit) floating-point elements in a to
5383 // packed 16-bit integers, and store the results in dst. Note: this intrinsic
5384 // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
5385 // 0x7FFFFFFF.
5386 //
5387 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
5388 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
5389 {
5390     return vreinterpret_m64_s16(
5391         vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
5392 }
5393 
5394 // Copy the lower 32-bit integer in a to dst.
5395 //
5396 //   dst[31:0] := a[31:0]
5397 //
5398 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
5399 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
5400 {
5401     return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
5402 }
5403 
5404 // Copy the lower 64-bit integer in a to dst.
5405 //
5406 //   dst[63:0] := a[63:0]
5407 //
5408 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
5409 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
5410 {
5411     return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
5412 }
5413 
5414 // Copy the lower 64-bit integer in a to dst.
5415 //
5416 //   dst[63:0] := a[63:0]
5417 //
5418 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
5419 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
5420 
5421 // Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
5422 // zero extending the upper bits.
5423 //
5424 //   r0 := a
5425 //   r1 := 0x0
5426 //   r2 := 0x0
5427 //   r3 := 0x0
5428 //
5429 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
5430 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
5431 {
5432     return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
5433 }
5434 
5435 // Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
5436 // zero extending the upper bits.
5437 //
5438 //   r0 := a
5439 //   r1 := 0x0
5440 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
5441 {
5442     return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
5443 }
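
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): a scalar moved into the low lane with _mm_cvtsi32_si128 comes back
// unchanged through _mm_cvtsi128_si32, and the upper lanes are zero. The
// helper name sse2neon_example_scalar_roundtrip is made up for this example.
FORCE_INLINE int sse2neon_example_scalar_roundtrip(int x)
{
    return _mm_cvtsi128_si32(_mm_cvtsi32_si128(x)); /* returns x */
}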
5444 
5445 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
5446 // compilation and does not generate any instructions, thus it has zero latency.
5447 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
5448 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
5449 {
5450     return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
5451 }
5452 
5453 // Applies a type cast to reinterpret four 32-bit floating point values passed
5454 // in as a 128-bit parameter as packed 32-bit integers.
5455 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
5456 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
5457 {
5458     return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
5459 }
5460 
5461 // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
5462 // compilation and does not generate any instructions, thus it has zero latency.
5463 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
5464 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
5465 {
5466 #if defined(__aarch64__)
5467     return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
5468 #else
5469     return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
5470 #endif
5471 }
5472 
5473 // Applies a type cast to reinterpret four 32-bit integers passed in as a
5474 // 128-bit parameter as packed 32-bit floating point values.
5475 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
5476 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
5477 {
5478     return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
5479 }
5480 
5481 // Loads 128-bit value. :
5482 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
5483 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
5484 {
5485     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
5486 }
5487 
5488 // Load a double-precision (64-bit) floating-point element from memory into both
5489 // elements of dst.
5490 //
5491 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
5492 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
5493 //
5494 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
5495 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
5496 {
5497 #if defined(__aarch64__)
5498     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
5499 #else
5500     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
5501 #endif
5502 }
5503 
5504 // Load a double-precision (64-bit) floating-point element from memory into both
5505 // elements of dst.
5506 //
5507 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
5508 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
5509 //
5510 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
5511 #define _mm_load_pd1 _mm_load1_pd
5512 
5513 // Load a double-precision (64-bit) floating-point element from memory into the
5514 // upper element of dst, and copy the lower element from a to dst. mem_addr does
5515 // not need to be aligned on any particular boundary.
5516 //
5517 //   dst[63:0] := a[63:0]
5518 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
5519 //
5520 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
5521 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
5522 {
5523 #if defined(__aarch64__)
5524     return vreinterpretq_m128d_f64(
5525         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
5526 #else
5527     return vreinterpretq_m128d_f32(vcombine_f32(
5528         vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
5529 #endif
5530 }
5531 
5541 // Load a double-precision (64-bit) floating-point element from memory into both
5542 // elements of dst.
5543 //
5544 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
5545 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
5546 //
5547 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
5548 #define _mm_loaddup_pd _mm_load1_pd
5549 
5550 // Loads 128-bit value. :
5551 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
5552 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
5553 {
5554     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
5555 }
5556 
5557 // Load unaligned 32-bit integer from memory into the first element of dst.
5558 //
5559 //   dst[31:0] := MEM[mem_addr+31:mem_addr]
5560 //   dst[MAX:32] := 0
5561 //
5562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
5563 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
5564 {
5565     return vreinterpretq_m128i_s32(
5566         vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
5567 }
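
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): read a single, possibly unaligned 32-bit value through the vector
// register; the remaining lanes are zeroed as documented above. The helper
// name sse2neon_example_load_low32 is made up for this example.
FORCE_INLINE int sse2neon_example_load_low32(const int *p)
{
    return _mm_cvtsi128_si32(_mm_loadu_si32(p));
}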
5568 
5569 // Convert packed double-precision (64-bit) floating-point elements in a to
5570 // packed single-precision (32-bit) floating-point elements, and store the
5571 // results in dst.
5572 //
5573 //   FOR j := 0 to 1
5574 //     i := 32*j
5575 //     k := 64*j
5576 //     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
5577 //   ENDFOR
5578 //   dst[127:64] := 0
5579 //
5580 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
5581 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
5582 {
5583 #if defined(__aarch64__)
5584     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
5585     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
5586 #else
5587     float a0 = (float) ((double *) &a)[0];
5588     float a1 = (float) ((double *) &a)[1];
5589     return _mm_set_ps(0, 0, a1, a0);
5590 #endif
5591 }
5592 
5593 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
5594 //
5595 //   dst[63:0] := a[63:0]
5596 //
5597 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
5598 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
5599 {
5600 #if defined(__aarch64__)
5601     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
5602 #else
5603     return ((double *) &a)[0];
5604 #endif
5605 }
5606 
5607 // Convert packed single-precision (32-bit) floating-point elements in a to
5608 // packed double-precision (64-bit) floating-point elements, and store the
5609 // results in dst.
5610 //
5611 //   FOR j := 0 to 1
5612 //     i := 64*j
5613 //     k := 32*j
5614 //     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
5615 //   ENDFOR
5616 //
5617 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
5618 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
5619 {
5620 #if defined(__aarch64__)
5621     return vreinterpretq_m128d_f64(
5622         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
5623 #else
5624     double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
5625     double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
5626     return _mm_set_pd(a1, a0);
5627 #endif
5628 }
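
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): widening the two low float lanes to double and narrowing them back
// is exact for those lanes, while the two upper result lanes are zero, as
// documented for _mm_cvtpd_ps above. The helper name
// sse2neon_example_ps_pd_roundtrip is made up for this example.
FORCE_INLINE __m128 sse2neon_example_ps_pd_roundtrip(__m128 a)
{
    return _mm_cvtpd_ps(_mm_cvtps_pd(a));
}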
5629 
5630 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
5631 // compilation and does not generate any instructions, thus it has zero latency.
5632 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
5633 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
5634 {
5635     return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
5636 }
5637 
5638 // Cast vector of type __m128d to type __m128. This intrinsic is only used for
5639 // compilation and does not generate any instructions, thus it has zero latency.
5640 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
5641 FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
5642 {
5643     return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
5644 }
5645 
5646 // Blend packed single-precision (32-bit) floating-point elements from a and b
5647 // using mask, and store the results in dst.
5648 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
5649 FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
5650 {
5651     // Use a signed shift right to create a mask with the sign bit
5652     uint32x4_t mask =
5653         vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
5654     float32x4_t a = vreinterpretq_f32_m128(_a);
5655     float32x4_t b = vreinterpretq_f32_m128(_b);
5656     return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
5657 }
5658 
5659 // Blend packed single-precision (32-bit) floating-point elements from a and b
5660 // using mask, and store the results in dst.
5661 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
5662 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
5663 {
5664     const uint32_t ALIGN_STRUCT(16)
5665         data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
5666                    ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
5667                    ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
5668                    ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
5669     uint32x4_t mask = vld1q_u32(data);
5670     float32x4_t a = vreinterpretq_f32_m128(_a);
5671     float32x4_t b = vreinterpretq_f32_m128(_b);
5672     return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
5673 }
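
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): in _mm_blend_ps, bit j of imm8 selects lane j from b when set and
// from a when clear, so 0x5 takes lanes 0 and 2 from b and lanes 1 and 3 from
// a. The helper name sse2neon_example_blend_even_lanes is made up for this
// example.
FORCE_INLINE __m128 sse2neon_example_blend_even_lanes(__m128 a, __m128 b)
{
    return _mm_blend_ps(a, b, 0x5);
}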
5674 
5675 // Blend packed double-precision (64-bit) floating-point elements from a and b
5676 // using mask, and store the results in dst.
5677 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
5678 FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
5679 {
5680     uint64x2_t mask =
5681         vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
5682 #if defined(__aarch64__)
5683     float64x2_t a = vreinterpretq_f64_m128d(_a);
5684     float64x2_t b = vreinterpretq_f64_m128d(_b);
5685     return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
5686 #else
5687     uint64x2_t a = vreinterpretq_u64_m128d(_a);
5688     uint64x2_t b = vreinterpretq_u64_m128d(_b);
5689     return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
5690 #endif
5691 }
5692 
5693 typedef struct {
5694     uint16_t res0;
5695     uint8_t res1 : 6;
5696     uint8_t bit22 : 1;
5697     uint8_t bit23 : 1;
5698     uint8_t res2;
5699 #if defined(__aarch64__)
5700     uint32_t res3;
5701 #endif
5702 } fpcr_bitfield;
5703 
5704 // Macro: Set the rounding mode bits of the MXCSR control and status register to
5705 // the value in unsigned 32-bit integer a. The rounding mode may contain any of
5706 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
5707 // _MM_ROUND_TOWARD_ZERO
5708 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
5709 FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
5710 {
5711     union {
5712         fpcr_bitfield field;
5713 #if defined(__aarch64__)
5714         uint64_t value;
5715 #else
5716         uint32_t value;
5717 #endif
5718     } r;
5719 
5720 #if defined(__aarch64__)
5721     asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
5722 #else
5723     asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
5724 #endif
5725 
5726     switch (rounding) {
5727     case _MM_ROUND_TOWARD_ZERO:
5728         r.field.bit22 = 1;
5729         r.field.bit23 = 1;
5730         break;
5731     case _MM_ROUND_DOWN:
5732         r.field.bit22 = 0;
5733         r.field.bit23 = 1;
5734         break;
5735     case _MM_ROUND_UP:
5736         r.field.bit22 = 1;
5737         r.field.bit23 = 0;
5738         break;
5739     default:  //_MM_ROUND_NEAREST
5740         r.field.bit22 = 0;
5741         r.field.bit23 = 0;
5742     }
5743 
5744 #if defined(__aarch64__)
5745     asm volatile("msr FPCR, %0" ::"r"(r.value)); /* write */
5746 #else
5747     asm volatile("vmsr FPSCR, %0" ::"r"(r.value)); /* write */
5748 #endif
5749 }
5750 
5751 FORCE_INLINE void _mm_setcsr(unsigned int a)
5752 {
5753     _MM_SET_ROUNDING_MODE(a);
5754 }
5755 
5756 // Round the packed single-precision (32-bit) floating-point elements in a using
5757 // the rounding parameter, and store the results as packed single-precision
5758 // floating-point elements in dst.
5759 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
5760 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
5761 {
5762 #if defined(__aarch64__)
5763     switch (rounding) {
5764     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
5765         return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
5766     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
5767         return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
5768     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
5769         return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
5770     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
5771         return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
5772     default:  //_MM_FROUND_CUR_DIRECTION
5773         return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
5774     }
5775 #else
5776     float *v_float = (float *) &a;
5777     __m128 zero, neg_inf, pos_inf;
5778 
5779     switch (rounding) {
5780     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
5781         return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
5782     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
5783         return (__m128){floorf(v_float[0]), floorf(v_float[1]),
5784                         floorf(v_float[2]), floorf(v_float[3])};
5785     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
5786         return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
5787                         ceilf(v_float[3])};
5788     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
5789         zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
5790         neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
5791                              floorf(v_float[2]), floorf(v_float[3]));
5792         pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
5793                              ceilf(v_float[2]), ceilf(v_float[3]));
5794         return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
5795     default:  //_MM_FROUND_CUR_DIRECTION
5796         return (__m128){roundf(v_float[0]), roundf(v_float[1]),
5797                         roundf(v_float[2]), roundf(v_float[3])};
5798     }
5799 #endif
5800 }
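
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): rounding toward negative infinity with an explicit rounding-control
// constant; this is exactly what the _mm_floor_ps wrapper further below does.
// The helper name sse2neon_example_floor is made up for this example.
FORCE_INLINE __m128 sse2neon_example_floor(__m128 a)
{
    return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}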
5801 
5802 // Convert packed single-precision (32-bit) floating-point elements in a to
5803 // packed 32-bit integers, and store the results in dst.
5804 //
5805 //   FOR j := 0 to 1
5806 //       i := 32*j
5807 //       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
5808 //   ENDFOR
5809 //
5810 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
5811 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
5812 {
5813 #if defined(__aarch64__)
5814     return vreinterpret_m64_s32(
5815         vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
5816 #else
5817     return vreinterpret_m64_s32(
5818         vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
5819             _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
5820 #endif
5821 }
5822 
5823 // Convert packed single-precision (32-bit) floating-point elements in a to
5824 // packed 32-bit integers, and store the results in dst.
5825 //
5826 //   FOR j := 0 to 1
5827 //       i := 32*j
5828 //       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
5829 //   ENDFOR
5830 //
5831 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
5832 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
5833 
5834 // Round the packed single-precision (32-bit) floating-point elements in a up to
5835 // an integer value, and store the results as packed single-precision
5836 // floating-point elements in dst.
5837 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
5838 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
5839 {
5840     return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
5841 }
5842 
5843 // Round the lower single-precision (32-bit) floating-point element in b up to
5844 // an integer value, store the result as a single-precision floating-point
5845 // element in the lower element of dst, and copy the upper 3 packed elements
5846 // from a to the upper elements of dst.
5847 //
5848 //   dst[31:0] := CEIL(b[31:0])
5849 //   dst[127:32] := a[127:32]
5850 //
5851 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
5852 FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
5853 {
5854     return _mm_move_ss(
5855         a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
5856 }
5857 
5858 // Round the packed single-precision (32-bit) floating-point elements in a down
5859 // to an integer value, and store the results as packed single-precision
5860 // floating-point elements in dst.
5861 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
5862 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
5863 {
5864     return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
5865 }
5866 
5867 // Round the lower single-precision (32-bit) floating-point element in b down to
5868 // an integer value, store the result as a single-precision floating-point
5869 // element in the lower element of dst, and copy the upper 3 packed elements
5870 // from a to the upper elements of dst.
5871 //
5872 //   dst[31:0] := FLOOR(b[31:0])
5873 //   dst[127:32] := a[127:32]
5874 //
5875 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
5876 FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
5877 {
5878     return _mm_move_ss(
5879         a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
5880 }
5881 
5882 // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
5883 // may perform better than _mm_loadu_si128 when the data crosses a cache line
5884 // boundary.
5885 //
5886 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
5887 //
5888 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
5889 #define _mm_lddqu_si128 _mm_loadu_si128
5890 
5891 /* Miscellaneous Operations */
5892 
5893 // Shifts the 8 signed 16-bit integers in a right by count bits while shifting
5894 // in the sign bit.
5895 //
5896 //   r0 := a0 >> count
5897 //   r1 := a1 >> count
5898 //   ...
5899 //   r7 := a7 >> count
5900 //
5901 // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
5902 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5903 {
5904     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5905     if (unlikely(c > 15))
5906         return _mm_cmplt_epi16(a, _mm_setzero_si128());
5907     return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5908 }
5909 
5910 // Shifts the 4 signed 32-bit integers in a right by count bits while shifting
5911 // in the sign bit.
5912 //
5913 //   r0 := a0 >> count
5914 //   r1 := a1 >> count
5915 //   r2 := a2 >> count
5916 //   r3 := a3 >> count
5917 //
5918 // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
5919 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5920 {
5921     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5922     if (unlikely(c > 31))
5923         return _mm_cmplt_epi32(a, _mm_setzero_si128());
5924     return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5925 }
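
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): an arithmetic right shift keeps the sign, so shifting -8 right by 2
// yields -2 in every lane; counts above 31 produce a vector of sign bits, as
// handled above. The helper name sse2neon_example_sra_by_2 is made up for
// this example.
FORCE_INLINE __m128i sse2neon_example_sra_by_2(__m128i a)
{
    return _mm_sra_epi32(a, _mm_cvtsi32_si128(2));
}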
5926 
5927 // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
5928 // saturates.
5929 // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
5930 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
5931 {
5932     return vreinterpretq_m128i_s8(
5933         vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
5934                     vqmovn_s16(vreinterpretq_s16_m128i(b))));
5935 }
5936 
5937 // Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
5938 // integers and saturates.
5939 //
5940 //   r0 := UnsignedSaturate(a0)
5941 //   r1 := UnsignedSaturate(a1)
5942 //   ...
5943 //   r7 := UnsignedSaturate(a7)
5944 //   r8 := UnsignedSaturate(b0)
5945 //   r9 := UnsignedSaturate(b1)
5946 //   ...
5947 //   r15 := UnsignedSaturate(b7)
5948 //
5949 // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
5950 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
5951 {
5952     return vreinterpretq_m128i_u8(
5953         vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
5954                     vqmovun_s16(vreinterpretq_s16_m128i(b))));
5955 }
5956 
5957 // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
5958 // and saturates.
5959 //
5960 //   r0 := SignedSaturate(a0)
5961 //   r1 := SignedSaturate(a1)
5962 //   r2 := SignedSaturate(a2)
5963 //   r3 := SignedSaturate(a3)
5964 //   r4 := SignedSaturate(b0)
5965 //   r5 := SignedSaturate(b1)
5966 //   r6 := SignedSaturate(b2)
5967 //   r7 := SignedSaturate(b3)
5968 //
5969 // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
5970 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
5971 {
5972     return vreinterpretq_m128i_s16(
5973         vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
5974                      vqmovn_s32(vreinterpretq_s32_m128i(b))));
5975 }
5976 
5977 // Packs the 8 signed 32-bit integers from a and b into unsigned 16-bit
5978 // integers and saturates.
5979 //
5980 //   r0 := UnsignedSaturate(a0)
5981 //   r1 := UnsignedSaturate(a1)
5982 //   r2 := UnsignedSaturate(a2)
5983 //   r3 := UnsignedSaturate(a3)
5984 //   r4 := UnsignedSaturate(b0)
5985 //   r5 := UnsignedSaturate(b1)
5986 //   r6 := UnsignedSaturate(b2)
5987 //   r7 := UnsignedSaturate(b3)
5988 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
5989 {
5990     return vreinterpretq_m128i_u16(
5991         vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
5992                      vqmovun_s32(vreinterpretq_s32_m128i(b))));
5993 }
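
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): clamp eight signed 32-bit values from a and b to the range 0..255 by
// chaining a signed 32->16 pack with an unsigned 16->8 pack; the low 8 bytes
// of the result hold the clamped values (duplicated into the high half here
// because w is passed twice). The helper name sse2neon_example_pack_s32_to_u8
// is made up for this example.
FORCE_INLINE __m128i sse2neon_example_pack_s32_to_u8(__m128i a, __m128i b)
{
    __m128i w = _mm_packs_epi32(a, b); /* signed saturate 32-bit -> 16-bit */
    return _mm_packus_epi16(w, w);     /* unsigned saturate 16-bit -> 8-bit */
}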
5994 
5995 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
5996 // 8 signed or unsigned 8-bit integers in b.
5997 //
5998 //   r0 := a0
5999 //   r1 := b0
6000 //   r2 := a1
6001 //   r3 := b1
6002 //   ...
6003 //   r14 := a7
6004 //   r15 := b7
6005 //
6006 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
6007 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6008 {
6009 #if defined(__aarch64__)
6010     return vreinterpretq_m128i_s8(
6011         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6012 #else
6013     int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6014     int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6015     int8x8x2_t result = vzip_s8(a1, b1);
6016     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6017 #endif
6018 }
6019 
6020 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6021 // lower 4 signed or unsigned 16-bit integers in b.
6022 //
6023 //   r0 := a0
6024 //   r1 := b0
6025 //   r2 := a1
6026 //   r3 := b1
6027 //   r4 := a2
6028 //   r5 := b2
6029 //   r6 := a3
6030 //   r7 := b3
6031 //
6032 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
6033 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6034 {
6035 #if defined(__aarch64__)
6036     return vreinterpretq_m128i_s16(
6037         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6038 #else
6039     int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6040     int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6041     int16x4x2_t result = vzip_s16(a1, b1);
6042     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6043 #endif
6044 }
6045 
6046 // Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
6047 // lower 2 signed or unsigned 32-bit integers in b.
6048 //
6049 //   r0 := a0
6050 //   r1 := b0
6051 //   r2 := a1
6052 //   r3 := b1
6053 //
6054 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
6055 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6056 {
6057 #if defined(__aarch64__)
6058     return vreinterpretq_m128i_s32(
6059         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6060 #else
6061     int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6062     int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6063     int32x2x2_t result = vzip_s32(a1, b1);
6064     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6065 #endif
6066 }
6067 
6068 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6069 {
6070     int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6071     int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6072     return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6073 }
6074 
6075 // Selects and interleaves the lower two single-precision, floating-point values
6076 // from a and b.
6077 //
6078 //   r0 := a0
6079 //   r1 := b0
6080 //   r2 := a1
6081 //   r3 := b1
6082 //
6083 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
6084 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
6085 {
6086 #if defined(__aarch64__)
6087     return vreinterpretq_m128_f32(
6088         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6089 #else
6090     float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
6091     float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
6092     float32x2x2_t result = vzip_f32(a1, b1);
6093     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
6094 #endif
6095 }
6096 
6097 // Unpack and interleave double-precision (64-bit) floating-point elements from
6098 // the low half of a and b, and store the results in dst.
6099 //
6100 //   DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
6101 //     dst[63:0] := src1[63:0]
6102 //     dst[127:64] := src2[63:0]
6103 //     RETURN dst[127:0]
6104 //   }
6105 //   dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
6106 //
6107 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
6108 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
6109 {
6110 #if defined(__aarch64__)
6111     return vreinterpretq_m128d_f64(
6112         vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6113 #else
6114     return vreinterpretq_m128d_s64(
6115         vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6116                      vget_low_s64(vreinterpretq_s64_m128d(b))));
6117 #endif
6118 }
6119 
6120 // Unpack and interleave double-precision (64-bit) floating-point elements from
6121 // the high half of a and b, and store the results in dst.
6122 //
6123 //   DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
6124 //     dst[63:0] := src1[127:64]
6125 //     dst[127:64] := src2[127:64]
6126 //     RETURN dst[127:0]
6127 //   }
6128 //   dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
6129 //
6130 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
6131 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6132 {
6133 #if defined(__aarch64__)
6134     return vreinterpretq_m128d_f64(
6135         vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6136 #else
6137     return vreinterpretq_m128d_s64(
6138         vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6139                      vget_high_s64(vreinterpretq_s64_m128d(b))));
6140 #endif
6141 }
6142 
6143 // Selects and interleaves the upper two single-precision, floating-point values
6144 // from a and b.
6145 //
6146 //   r0 := a2
6147 //   r1 := b2
6148 //   r2 := a3
6149 //   r3 := b3
6150 //
6151 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
6152 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
6153 {
6154 #if defined(__aarch64__)
6155     return vreinterpretq_m128_f32(
6156         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6157 #else
6158     float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
6159     float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
6160     float32x2x2_t result = vzip_f32(a1, b1);
6161     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
6162 #endif
6163 }
6164 
6165 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
6166 // 8 signed or unsigned 8-bit integers in b.
6167 //
6168 //   r0 := a8
6169 //   r1 := b8
6170 //   r2 := a9
6171 //   r3 := b9
6172 //   ...
6173 //   r14 := a15
6174 //   r15 := b15
6175 //
6176 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
6177 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
6178 {
6179 #if defined(__aarch64__)
6180     return vreinterpretq_m128i_s8(
6181         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6182 #else
6183     int8x8_t a1 =
6184         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
6185     int8x8_t b1 =
6186         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
6187     int8x8x2_t result = vzip_s8(a1, b1);
6188     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6189 #endif
6190 }
6191 
6192 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
6193 // upper 4 signed or unsigned 16-bit integers in b.
6194 //
6195 //   r0 := a4
6196 //   r1 := b4
6197 //   r2 := a5
6198 //   r3 := b5
6199 //   r4 := a6
6200 //   r5 := b6
6201 //   r6 := a7
6202 //   r7 := b7
6203 //
6204 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
6205 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
6206 {
6207 #if defined(__aarch64__)
6208     return vreinterpretq_m128i_s16(
6209         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6210 #else
6211     int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
6212     int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
6213     int16x4x2_t result = vzip_s16(a1, b1);
6214     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6215 #endif
6216 }
6217 
6218 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
6219 // upper 2 signed or unsigned 32-bit integers in b.
6220 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
6221 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
6222 {
6223 #if defined(__aarch64__)
6224     return vreinterpretq_m128i_s32(
6225         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6226 #else
6227     int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
6228     int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
6229     int32x2x2_t result = vzip_s32(a1, b1);
6230     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6231 #endif
6232 }
6233 
6234 // Interleaves the upper signed or unsigned 64-bit integer in a with the
6235 // upper signed or unsigned 64-bit integer in b.
6236 //
6237 //   r0 := a1
6238 //   r1 := b1
6239 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
6240 {
6241     int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
6242     int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
6243     return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
6244 }
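
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): a 2x2 transpose of 64-bit lanes built from the unpack pair above:
// *out0 = {a0, b0} and *out1 = {a1, b1}. The helper name
// sse2neon_example_transpose2x2_epi64 is made up for this example.
FORCE_INLINE void sse2neon_example_transpose2x2_epi64(__m128i a,
                                                      __m128i b,
                                                      __m128i *out0,
                                                      __m128i *out1)
{
    *out0 = _mm_unpacklo_epi64(a, b);
    *out1 = _mm_unpackhi_epi64(a, b);
}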
6245 
6246 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
6247 // in a, store the minimum and index in dst, and zero the remaining bits in dst.
6248 //
6249 //   index[2:0] := 0
6250 //   min[15:0] := a[15:0]
6251 //   FOR j := 0 to 7
6252 //       i := j*16
6253 //       IF a[i+15:i] < min[15:0]
6254 //           index[2:0] := j
6255 //           min[15:0] := a[i+15:i]
6256 //       FI
6257 //   ENDFOR
6258 //   dst[15:0] := min[15:0]
6259 //   dst[18:16] := index[2:0]
6260 //   dst[127:19] := 0
6261 //
6262 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
6263 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
6264 {
6265     __m128i dst;
6266     uint16_t min, idx = 0;
6267     // Find the minimum value
6268 #if defined(__aarch64__)
6269     min = vminvq_u16(vreinterpretq_u16_m128i(a));
6270 #else
6271     __m64 tmp;
6272     tmp = vreinterpret_m64_u16(
6273         vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
6274                  vget_high_u16(vreinterpretq_u16_m128i(a))));
6275     tmp = vreinterpret_m64_u16(
6276         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
6277     tmp = vreinterpret_m64_u16(
6278         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
6279     min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
6280 #endif
6281     // Get the index of the minimum value
6282     int i;
6283     for (i = 0; i < 8; i++) {
6284         if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
6285             idx = (uint16_t) i;
6286             break;
6287         }
6288         a = _mm_srli_si128(a, 2);
6289     }
6290     // Generate result
6291     dst = _mm_setzero_si128();
6292     dst = vreinterpretq_m128i_u16(
6293         vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
6294     dst = vreinterpretq_m128i_u16(
6295         vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
6296     return dst;
6297 }
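
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): split the _mm_minpos_epu16 result into the minimum value (bits 15:0)
// and the index of that lane (bits 18:16). The helper name
// sse2neon_example_hmin_u16 is made up for this example.
FORCE_INLINE void sse2neon_example_hmin_u16(__m128i a,
                                            unsigned *min_val,
                                            unsigned *min_idx)
{
    unsigned packed = (unsigned) _mm_cvtsi128_si32(_mm_minpos_epu16(a));
    *min_val = packed & 0xFFFF;      /* dst[15:0]  */
    *min_idx = (packed >> 16) & 0x7; /* dst[18:16] */
}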
6298 
6299 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
6300 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
6301 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
6302 // otherwise set CF to 0. Return the CF value.
6303 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
6304 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
6305 {
6306     int64x2_t s64 =
6307         vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
6308                   vreinterpretq_s64_m128i(b));
6309     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
6310 }
6311 
6312 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
6313 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
6314 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
6315 // otherwise set CF to 0. Return the ZF value.
6316 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
6317 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
6318 {
6319     int64x2_t s64 =
6320         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
6321     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
6322 }
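
// Illustrative usage sketch (not part of the SSE API, added as documentation
// only): _mm_testz_si128(a, a) sets ZF exactly when every bit of a is zero,
// which makes it a convenient 128-bit "is all zero" test. The helper name
// sse2neon_example_is_all_zero is made up for this example.
FORCE_INLINE int sse2neon_example_is_all_zero(__m128i a)
{
    return _mm_testz_si128(a, a); /* 1 iff a == 0 */
}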

// Extracts the selected signed or unsigned 8-bit integer from a and zero
// extends.
// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))

// Inserts the least significant 8 bits of b into the selected 8-bit integer
// of a.
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
//                                      __constrange(0,16) int imm)
#define _mm_insert_epi8(a, b, imm)                                 \
    __extension__({                                                \
        vreinterpretq_m128i_s8(                                    \
            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
    })

// Extracts the selected signed or unsigned 16-bit integer from a and zero
// extends.
// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
#define _mm_extract_epi16(a, imm) \
    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))

// Inserts the least significant 16 bits of b into the selected 16-bit integer
// of a.
// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
//                                       __constrange(0,8) int imm)
#define _mm_insert_epi16(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s16(                                     \
            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
    })

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
#define _mm_insert_pi16(a, b, imm)                               \
    __extension__({                                              \
        vreinterpret_m64_s16(                                    \
            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
    })

// Extracts the selected signed or unsigned 32-bit integer from a and zero
// extends.
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
#define _mm_extract_epi32(a, imm) \
    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
// Extracts the selected single-precision (32-bit) floating-point element from
// a, returning its bit pattern as a 32-bit integer.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))

// Inserts the least significant 32 bits of b into the selected 32-bit integer
// of a.
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
//                                       __constrange(0,4) int imm)
#define _mm_insert_epi32(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s32(                                     \
            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
    })

// Extracts the selected signed or unsigned 64-bit integer from a and zero
// extends.
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
#define _mm_extract_epi64(a, imm) \
    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))

// Inserts the least significant 64 bits of b into the selected 64-bit integer
// of a.
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
//                                       __constrange(0,2) int imm)
#define _mm_insert_epi64(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s64(                                     \
            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
    })
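
// Illustrative caller-side sketch: because these are macros over
// vgetq_lane/vsetq_lane, the lane index must be a compile-time constant,
// exactly as the __constrange annotations above suggest.
//
//   __m128i v = _mm_set_epi32(4, 3, 2, 1);
//   int lane2 = _mm_extract_epi32(v, 2);     // 3
//   __m128i w = _mm_insert_epi32(v, 42, 0);  // lane 0 becomes 42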

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    // Widen the 32-bit value into a 64-bit lane rather than loading 8 bytes
    // from a 4-byte object, then pairwise-add the per-byte counts.
    input_val = vcreate_u8((uint64_t) a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    // Store only lane 0; a full vst1_u32 would write 8 bytes into `count`.
    vst1_lane_u32(&count, count32x2_val, 0);
    return count;
#endif
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}
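
// Illustrative caller-side sketch (hypothetical buffer and length names):
// counting the set bits of a word array with the popcount helpers above.
//
//   uint64_t total = 0;
//   for (size_t i = 0; i < len; i++)
//       total += (uint64_t) _mm_popcnt_u64(words[i]);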

// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
// transposed matrix in these vectors (row0 now contains column 0, etc.).
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
    do {                                                  \
        float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
        float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
                            vget_low_f32(ROW23.val[0]));  \
        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
                            vget_low_f32(ROW23.val[1]));  \
        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
                            vget_high_f32(ROW23.val[0])); \
        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
                            vget_high_f32(ROW23.val[1])); \
    } while (0)
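
// Illustrative caller-side sketch: the macro rewrites its arguments in place,
// so the rows must be modifiable __m128 lvalues.
//
//   __m128 r0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 r1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   __m128 r2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
//   __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
//   // r0 is now {1, 5, 9, 13}, r1 is {2, 6, 10, 14}, and so on.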

/* Crypto Extensions */

#if defined(__ARM_FEATURE_CRYPTO)
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
    return vreinterpretq_u64_p128(vmull_p64(a, b));
}
#else  // ARMv7 polyfill
// ARMv7 and some AArch64 configurations lack vmull_p64, but they do provide
// vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4

    // Add cross products
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}
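
// Illustrative caller-side sketch: imm selects which 64-bit halves are
// multiplied; bit 0 picks the half of a and bit 4 picks the half of b, so
// 0x00, 0x01, 0x10 and 0x11 cover all four combinations.
//
//   __m128i x = _mm_set_epi64x(0, 0x87);  // low half = x^7 + x^2 + x + 1
//   __m128i y = _mm_set_epi64x(0, 0x02);  // low half = x
//   __m128i p = _mm_clmulepi64_si128(x, y, 0x00);  // carry-less product 0x10e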

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
#define SSE2NEON_AES_DATA(w)                                           \
    {                                                                  \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
    }
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0
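
// Illustrative sketch of the X Macro trick used above: SSE2NEON_AES_DATA(w)
// expands every table entry through the macro passed in as `w`, so the same
// data can be emitted verbatim (SSE2NEON_AES_H0 above) or transformed (the
// SSE2NEON_AES_U0..U3 wrappers further below). A two-entry table defined the
// same way (hypothetical names) would look like:
//
//   #define DATA(w) { w(0x01), w(0x02) }
//   #define IDENT(x) (x)
//   #define DOUBLE(x) ((x) * 2)
//   static const int plain[2] = DATA(IDENT);    // {1, 2}
//   static const int scaled[2] = DATA(DOUBLE);  // {2, 4}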

// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information. Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
                                         0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);

    // mix columns
    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
     (b0))
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));

    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    /* FIXME: not yet optimized for NEON */
    uint8_t v[4][4] = {
        [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
        [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
        [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
        [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
               SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
    };
    for (int i = 0; i < 16; i++)
        vreinterpretq_nth_u8_m128i(a, i) =
            v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
    return a;
}

// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
//
// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
{
    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
}
#undef SSE2NEON_AES_DATA

#else /* __ARM_FEATURE_CRYPTO */
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
// AESMC and then manually applying the real key as an xor operation. This
// unfortunately means an additional xor op; the compiler should be able to
// optimize this away for repeated calls however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
        vreinterpretq_u8_m128i(b));
}

// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
                         RoundKey);
}

FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE does ShiftRows and SubBytes on A
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
}
#endif
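
// Illustrative caller-side sketch (hypothetical key schedule): encrypting one
// block with AES-128 uses nine full rounds of _mm_aesenc_si128 followed by one
// _mm_aesenclast_si128, assuming round_keys[0..10] already holds the expanded
// key.
//
//   __m128i state = _mm_xor_si128(plaintext, round_keys[0]);
//   for (int round = 1; round < 10; round++)
//       state = _mm_aesenc_si128(state, round_keys[round]);
//   state = _mm_aesenclast_si128(state, round_keys[10]);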

/* Streaming Extensions */

// Guarantees that every preceding store is globally visible before any
// subsequent store.
// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
FORCE_INLINE void _mm_sfence(void)
{
    __sync_synchronize();
}

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
// point elements) from a into memory using a non-temporal memory hint.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, (float32x4_t *) p);
#else
    vst1q_f32(p, vreinterpretq_f32_m128(a));
#endif
}
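
// Illustrative caller-side sketch (hypothetical buffer names): a typical
// streaming-store loop writes whole vectors with _mm_stream_ps and then issues
// _mm_sfence so the non-temporal stores are ordered before later stores.
//
//   for (size_t i = 0; i + 4 <= n; i += 4)
//       _mm_stream_ps(&dst[i], _mm_loadu_ps(&src[i]));
//   _mm_sfence();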

// Stores the data in a to the address p without polluting the caches. If the
// cache line containing address p is already in the cache, the cache will be
// updated.
// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, p);
#else
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
#endif
}

// Load 128-bits of integer data from memory into dst using a non-temporal
// memory hint. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
//   dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
{
#if __has_builtin(__builtin_nontemporal_load)
    return __builtin_nontemporal_load(p);
#else
    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
#endif
}

// Cache line containing p is flushed and invalidated from all caches in the
// coherency domain.
// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
FORCE_INLINE void _mm_clflush(void const *p)
{
    (void) p;
    // No portable user-space cache-flush equivalent on Arm; treated as a no-op.
}

// Allocate aligned blocks of memory.
// https://software.intel.com/en-us/
//         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
{
    void *ptr;
    if (align == 1)
        return malloc(size);
    if (align == 2 || (sizeof(void *) == 8 && align == 4))
        align = sizeof(void *);
    if (!posix_memalign(&ptr, align, size))
        return ptr;
    return NULL;
}
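
// Illustrative caller-side sketch: allocating a 16-byte-aligned scratch buffer
// suitable for the aligned load/store intrinsics, then releasing it with
// _mm_free.
//
//   float *buf = (float *) _mm_malloc(256 * sizeof(float), 16);
//   if (buf) {
//       _mm_store_ps(buf, _mm_set1_ps(0.0f));  // aligned store is now safe
//       _mm_free(buf);
//   }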

// Free aligned memory that was allocated with _mm_malloc.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
FORCE_INLINE void _mm_free(void *addr)
{
    free(addr);
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 8-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc ^= v;
    for (int bit = 0; bit < 8; bit++) {
        if (crc & 1)
            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
        else
            crc = (crc >> 1);
    }
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 16-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u8(crc, v & 0xff);
    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 32-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u16(crc, v & 0xffff);
    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 64-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
    crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
#endif
    return crc;
}
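
// Illustrative caller-side sketch (hypothetical buffer names): these helpers
// accumulate CRC32-C (Castagnoli polynomial, reflected constant 0x82f63b78 as
// in the fallback above); the conventional usage starts from 0xFFFFFFFF and
// inverts the result at the end.
//
//   uint32_t crc = 0xFFFFFFFF;
//   for (size_t i = 0; i < len; i++)
//       crc = _mm_crc32_u8(crc, data[i]);
//   crc = ~crc;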

#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

#endif