1 #ifndef SSE2NEON_H
2 #define SSE2NEON_H
3 
// This header file provides a simple API translation layer
// from SSE intrinsics to their corresponding Arm/AArch64 NEON versions
6 //
7 // This header file does not yet translate all of the SSE intrinsics.
8 //
9 // Contributors to this work are:
10 //   John W. Ratcliff <jratcliffscarab@gmail.com>
11 //   Brandon Rowlett <browlett@nvidia.com>
12 //   Ken Fast <kfast@gdeb.com>
13 //   Eric van Beurden <evanbeurden@nvidia.com>
14 //   Alexander Potylitsin <apotylitsin@nvidia.com>
15 //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
16 //   Jim Huang <jserv@biilabs.io>
17 //   Mark Cheng <marktwtn@biilabs.io>
18 //   Malcolm James MacLeod <malcolm@gulden.com>
19 //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20 //   Sebastian Pop <spop@amazon.com>
21 //   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22 //   Danila Kutenin <danilak@google.com>
23 //   François Turban (JishinMaster) <francois.turban@gmail.com>
24 //   Pei-Hsuan Hung <afcidk@gmail.com>
25 //   Yang-Hao Yuan <yanghau@biilabs.io>
26 //   Syoyo Fujita <syoyo@lighttransport.com>
27 //   Brecht Van Lommel <brecht@blender.org>
28 
29 /*
30  * sse2neon is freely redistributable under the MIT License.
31  *
32  * Permission is hereby granted, free of charge, to any person obtaining a copy
33  * of this software and associated documentation files (the "Software"), to deal
34  * in the Software without restriction, including without limitation the rights
35  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
36  * copies of the Software, and to permit persons to whom the Software is
37  * furnished to do so, subject to the following conditions:
38  *
39  * The above copyright notice and this permission notice shall be included in
40  * all copies or substantial portions of the Software.
41  *
42  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
47  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
48  * SOFTWARE.
49  */
50 
51 /* Tunable configurations */
52 
/* Enable precise implementations of math operations.
 * This slows down the computation a bit, but gives results consistent with
 * x86 SSE2 (e.g. it can fix a hole or NaN pixel in a rendering result).
 */
57 /* _mm_min_ps and _mm_max_ps */
58 #ifndef SSE2NEON_PRECISE_MINMAX
59 #define SSE2NEON_PRECISE_MINMAX (0)
60 #endif
61 /* _mm_rcp_ps and _mm_div_ps */
62 #ifndef SSE2NEON_PRECISE_DIV
63 #define SSE2NEON_PRECISE_DIV (0)
64 #endif
65 /* _mm_sqrt_ps and _mm_rsqrt_ps */
66 #ifndef SSE2NEON_PRECISE_SQRT
67 #define SSE2NEON_PRECISE_SQRT (0)
68 #endif
69 
70 #if defined(__GNUC__) || defined(__clang__)
71 #pragma push_macro("FORCE_INLINE")
72 #pragma push_macro("ALIGN_STRUCT")
73 #define FORCE_INLINE static inline __attribute__((always_inline))
74 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
75 #ifndef likely
76 #define likely(x) __builtin_expect(!!(x), 1)
77 #endif
78 #ifndef unlikely
79 #define unlikely(x) __builtin_expect(!!(x), 0)
80 #endif
81 #else
82 #error "Macro name collisions may happen with unsupported compiler."
83 #ifdef FORCE_INLINE
84 #undef FORCE_INLINE
85 #endif
86 #define FORCE_INLINE static inline
87 #ifndef ALIGN_STRUCT
88 #define ALIGN_STRUCT(x) __declspec(align(x))
89 #endif
90 #endif
91 #ifndef likely
92 #define likely(x) (x)
93 #endif
94 #ifndef unlikely
95 #define unlikely(x) (x)
96 #endif
97 
98 #include <stdint.h>
99 #include <stdlib.h>
100 
101 /* Architecture-specific build options */
102 /* FIXME: #pragma GCC push_options is only available on GCC */
103 #if defined(__GNUC__)
104 #if defined(__arm__) && __ARM_ARCH == 7
/* According to the ARM C Language Extensions (ACLE) specification, __ARM_NEON
 * is defined to a value indicating whether the Advanced SIMD (NEON)
 * architecture is supported.
 */
109 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
110 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
111 #endif
112 #if !defined(__clang__)
113 #pragma GCC push_options
114 #pragma GCC target("fpu=neon")
115 #endif
116 #elif defined(__aarch64__)
117 #if !defined(__clang__)
118 #pragma GCC push_options
119 #pragma GCC target("+simd")
120 #endif
121 #else
122 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
123 #endif
124 #endif
125 
126 #include <arm_neon.h>
127 
/* Rounding functions require either AArch64 instructions or a libm fallback */
129 #if !defined(__aarch64__)
130 #include <math.h>
131 #endif
132 
133 /* "__has_builtin" can be used to query support for built-in functions
134  * provided by gcc/clang and other compilers that support it.
135  */
136 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
137 /* Compatibility with gcc <= 9 */
138 #if __GNUC__ <= 9
139 #define __has_builtin(x) HAS##x
140 #define HAS__builtin_popcount 1
141 #define HAS__builtin_popcountll 1
142 #else
143 #define __has_builtin(x) 0
144 #endif
145 #endif
146 
/**
 * MACRO for the shuffle parameter of _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the float from argument "b"
 * of _mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
 * for fp2 of the result. fp1 is a digit[0123] that represents the float from
 * argument "a" of _mm_shuffle_ps that will be placed in fp1 of the result.
 * fp0 is the same for fp0 of the result.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
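// Example (illustrative): _MM_SHUFFLE(1, 0, 3, 2) evaluates to 0x4E
// (binary 01 00 11 10). Used as _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)),
// it selects {a[2], a[3], b[0], b[1]} for result lanes 0..3.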
157 
158 /* Rounding mode macros. */
159 #define _MM_FROUND_TO_NEAREST_INT 0x00
160 #define _MM_FROUND_TO_NEG_INF 0x01
161 #define _MM_FROUND_TO_POS_INF 0x02
162 #define _MM_FROUND_TO_ZERO 0x03
163 #define _MM_FROUND_CUR_DIRECTION 0x04
164 #define _MM_FROUND_NO_EXC 0x08
165 #define _MM_ROUND_NEAREST 0x0000
166 #define _MM_ROUND_DOWN 0x2000
167 #define _MM_ROUND_UP 0x4000
168 #define _MM_ROUND_TOWARD_ZERO 0x6000
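// Example (illustrative) of how these macros are typically combined, assuming
// x is an __m128:
//   __m128 r = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  // change the current mode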
169 
/* Indicates an immediate constant argument in a given range */
171 #define __constrange(a, b) const
172 
173 /* A few intrinsics accept traditional data types like ints or floats, but
174  * most operate on data types that are specific to SSE.
175  * If a vector type ends in d, it contains doubles, and if it does not have
176  * a suffix, it contains floats. An integer vector type can contain any type
177  * of integer, from chars to shorts to unsigned long longs.
178  */
179 typedef int64x1_t __m64;
180 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On the 32-bit Arm architecture, float64x2_t is not supported.
// The data type __m128d therefore has to be represented differently for the
// related intrinsic conversions.
184 #if defined(__aarch64__)
185 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
186 #else
187 typedef float32x4_t __m128d;
188 #endif
189 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
190 
191 /* type-safe casting between types */
192 
193 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
194 #define vreinterpretq_m128_f32(x) (x)
195 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
196 
197 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
198 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
199 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
200 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
201 
202 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
203 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
204 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
205 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
206 
207 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
208 #define vreinterpretq_f32_m128(x) (x)
209 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
210 
211 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
212 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
213 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
214 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
215 
216 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
217 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
218 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
219 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
220 
221 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
222 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
223 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
224 #define vreinterpretq_m128i_s64(x) (x)
225 
226 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
227 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
228 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
229 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
230 
231 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
232 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
233 
234 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
235 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
236 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
237 #define vreinterpretq_s64_m128i(x) (x)
238 
239 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
240 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
241 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
242 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
243 
244 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
245 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
246 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
247 #define vreinterpret_m64_s64(x) (x)
248 
249 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
250 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
251 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
252 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
253 
254 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
255 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
256 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
257 
258 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
259 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
260 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
261 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
262 
263 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
264 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
265 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
266 #define vreinterpret_s64_m64(x) (x)
267 
268 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
269 
270 #if defined(__aarch64__)
271 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
272 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
273 
274 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
275 
276 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
277 #define vreinterpretq_m128d_f64(x) (x)
278 
279 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
280 
281 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
282 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
283 
284 #define vreinterpretq_f64_m128d(x) (x)
285 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
286 #else
287 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
288 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
289 
290 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
291 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
292 
293 #define vreinterpretq_m128d_f32(x) (x)
294 
295 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
296 
297 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
298 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
299 
300 #define vreinterpretq_f32_m128d(x) (x)
301 #endif
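// Example (illustrative): these helpers let an SSE-style wrapper around a NEON
// intrinsic stay type-correct without explicit casts; e.g. an epi32 addition
// could be written as
//   vreinterpretq_m128i_s32(vaddq_s32(vreinterpretq_s32_m128i(a),
//                                     vreinterpretq_s32_m128i(b)));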
302 
// A struct named 'SIMDVec' is defined in this header file; it can be used by
// applications that attempt to access the contents of an __m128 struct
// directly.  Note that accessing the __m128 members directly is considered bad
// coding practice by Microsoft: @see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly, so the developer can use SIMDVec as an alias for it.  Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// This union is intended to allow direct access to an __m128 variable using
// the member names that the MSVC compiler provides.  It should really only be
// used when trying to access the members of the vector as integer values.
// GCC/Clang allow native access to the float members through a simple array
// subscript (in C since 4.6, in C++ since 4.8).
//
// Ideally, direct accesses to SIMD vectors should be avoided since they can
// cause a performance hit.  If such access really is needed, the original
// __m128 variable can be aliased with a pointer to this union and used to
// access individual components.  The use of this union should be hidden behind
// a macro that is used throughout the codebase to access the members instead
// of always declaring this type of variable.
326 typedef union ALIGN_STRUCT(16) SIMDVec {
327     float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
328     int8_t m128_i8[16];    // as signed 8-bit integers.
329     int16_t m128_i16[8];   // as signed 16-bit integers.
330     int32_t m128_i32[4];   // as signed 32-bit integers.
331     int64_t m128_i64[2];   // as signed 64-bit integers.
332     uint8_t m128_u8[16];   // as unsigned 8-bit integers.
333     uint16_t m128_u16[8];  // as unsigned 16-bit integers.
334     uint32_t m128_u32[4];  // as unsigned 32-bit integers.
335     uint64_t m128_u64[2];  // as unsigned 64-bit integers.
336 } SIMDVec;
337 
338 // casting using SIMDVec
339 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
340 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
341 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
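// Example (illustrative): reading the third 32-bit lane of an __m128i through
// the SIMDVec alias versus the preferred NEON intrinsic:
//   __m128i v = _mm_set_epi32(4, 3, 2, 1);
//   uint32_t x = vreinterpretq_nth_u32_m128i(v, 2);              // 3
//   uint32_t y = vgetq_lane_u32(vreinterpretq_u32_m128i(v), 2);  // also 3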
342 
343 // Function declaration
344 // SSE
345 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
346 FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
347 // SSE2
348 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
349 FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
350 FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
351 FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
352 FORCE_INLINE __m128d _mm_set_pd(double, double);
353 // SSE4.1
354 FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
355 FORCE_INLINE __m128 _mm_ceil_ps(__m128);
356 FORCE_INLINE __m128d _mm_floor_pd(__m128d);
357 FORCE_INLINE __m128 _mm_floor_ps(__m128);
358 FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
359 FORCE_INLINE __m128 _mm_round_ps(__m128, int);
360 // SSE4.2
361 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
362 
/* Backwards compatibility for compilers lacking specific type support */

// Older GCC versions do not define the vld1q_u8_x4 intrinsic
366 #if defined(__GNUC__) && !defined(__clang__) &&                        \
367     ((__GNUC__ <= 10 && defined(__arm__)) ||                           \
368      (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
369      (__GNUC__ <= 9 && defined(__aarch64__)))
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
371 {
372     uint8x16x4_t ret;
373     ret.val[0] = vld1q_u8(p + 0);
374     ret.val[1] = vld1q_u8(p + 16);
375     ret.val[2] = vld1q_u8(p + 32);
376     ret.val[3] = vld1q_u8(p + 48);
377     return ret;
378 }
379 #else
380 // Wraps vld1q_u8_x4
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
382 {
383     return vld1q_u8_x4(p);
384 }
385 #endif
386 
387 /* Function Naming Conventions
388  * The naming convention of SSE intrinsics is straightforward. A generic SSE
389  * intrinsic function is given as follows:
390  *   _mm_<name>_<data_type>
391  *
392  * The parts of this format are given as follows:
393  * 1. <name> describes the operation performed by the intrinsic
394  * 2. <data_type> identifies the data type of the function's primary arguments
395  *
396  * This last part, <data_type>, is a little complicated. It identifies the
397  * content of the input values, and can be set to any of the following values:
398  * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
400  * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
401  *                            signed integers
402  * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
403  *                            unsigned integers
404  * + si128 - unspecified 128-bit vector or 256-bit vector
405  * + m128/m128i/m128d - identifies input vector types when they are different
406  *                      than the type of the returned vector
407  *
408  * For example, _mm_setzero_ps. The _mm implies that the function returns
409  * a 128-bit vector. The _ps at the end implies that the argument vectors
410  * contain floats.
411  *
412  * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 *   // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
 *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 *   // Set packed 8-bit integers
 *   // 128 bits, 16 chars, 8 bits each
417  *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
418  *                                  4, 5, 12, 13, 6, 7, 14, 15);
419  *   // Shuffle packed 8-bit integers
420  *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
421  *
422  * Data (Number, Binary, Byte Index):
423     +------+------+-------------+------+------+-------------+
424     |      1      |      2      |      3      |      4      | Number
425     +------+------+------+------+------+------+------+------+
426     | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
427     +------+------+------+------+------+------+------+------+
428     |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
429     +------+------+------+------+------+------+------+------+
430 
431     +------+------+------+------+------+------+------+------+
432     |      5      |      6      |      7      |      8      | Number
433     +------+------+------+------+------+------+------+------+
434     | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
435     +------+------+------+------+------+------+------+------+
436     |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
437     +------+------+------+------+------+------+------+------+
438  * Index (Byte Index):
439     +------+------+------+------+------+------+------+------+
440     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
441     +------+------+------+------+------+------+------+------+
442 
443     +------+------+------+------+------+------+------+------+
444     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
445     +------+------+------+------+------+------+------+------+
446  * Result:
447     +------+------+------+------+------+------+------+------+
448     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
449     +------+------+------+------+------+------+------+------+
450     | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
451     +------+------+------+------+------+------+------+------+
452     |     256     |      2      |      5      |      6      | Number
453     +------+------+------+------+------+------+------+------+
454 
455     +------+------+------+------+------+------+------+------+
456     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
457     +------+------+------+------+------+------+------+------+
458     | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
459     +------+------+------+------+------+------+------+------+
460     |      3      |      7      |      4      |      8      | Number
461     +------+------+------+------+------+------+-------------+
462  */
463 
464 /* Constants for use with _mm_prefetch.  */
465 enum _mm_hint {
466     _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
467     _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
468     _MM_HINT_T1 = 2,   /* load data to L2 cache only */
469     _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
470     _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
471     _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
472     _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
473     _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
474 };
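// Example (illustrative), assuming ptr points into a large read-only buffer
// that is about to be streamed through:
//   _mm_prefetch((const char *) (ptr + 64), _MM_HINT_T0);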
475 
476 // The bit field mapping to the FPCR(floating-point control register)
477 typedef struct {
478     uint16_t res0;
479     uint8_t res1 : 6;
480     uint8_t bit22 : 1;
481     uint8_t bit23 : 1;
482     uint8_t res2;
483 #if defined(__aarch64__)
484     uint32_t res3;
485 #endif
486 } fpcr_bitfield;
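// bit22/bit23 correspond to the RMode field of the FPCR (FPSCR on ARMv7),
// which encodes the rounding mode: 00 = to nearest, 01 = toward +inf,
// 10 = toward -inf, 11 = toward zero. The rounding-mode helpers
// (_MM_GET_ROUNDING_MODE/_MM_SET_ROUNDING_MODE) work on these two bits through
// this bit field.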
487 
488 // Takes the upper 64 bits of a and places it in the low end of the result
489 // Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
491 {
492     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
493     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
494     return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
495 }
496 
// takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the higher two 32-bit values from b, swaps
// them, and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
501 {
502     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
503     float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
504     return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
505 }
506 
FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
508 {
509     float32x2_t a21 = vget_high_f32(
510         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
511     float32x2_t b03 = vget_low_f32(
512         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
513     return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
514 }
515 
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
517 {
518     float32x2_t a03 = vget_low_f32(
519         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
520     float32x2_t b21 = vget_high_f32(
521         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
522     return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
523 }
524 
FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
526 {
527     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
528     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
529     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
530 }
531 
FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
533 {
534     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
535     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
536     return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
537 }
538 
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
540 {
541     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
542     float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
543     return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
544 }
545 
// keeps the low 64 bits of a in the low half and puts the high 64 bits of b in
// the high half
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
549 {
550     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
551     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
552     return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
553 }
554 
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
556 {
557     float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
558     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
559     return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
560 }
561 
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
563 {
564     float32x2_t a22 =
565         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
566     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
567     return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
568 }
569 
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
571 {
572     float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
573     float32x2_t b22 =
574         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
575     return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
576 }
577 
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
579 {
580     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
581     float32x2_t a22 =
582         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
583     float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
584     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
585     return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
586 }
587 
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
589 {
590     float32x2_t a33 =
591         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
592     float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
593     return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
594 }
595 
FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
597 {
598     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
599     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
600     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
601     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
602     return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
603 }
604 
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
606 {
607     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
609     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
610     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
611     return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
612 }
613 
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
615 {
616     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
618     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
619     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
620     return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
621 }
622 
623 // Kahan summation for accurate summation of floating-point numbers.
624 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
626 {
627     y -= *c;
628     float t = *sum + y;
629     *c = (t - *sum) - y;
630     *sum = t;
631 }
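// Example (illustrative) of the compensated-summation pattern, assuming data
// and n are a caller-provided float array and its length:
//   float sum = 0.0f, c = 0.0f;
//   for (int i = 0; i < n; i++)
//       _sse2neon_kadd_f32(&sum, &c, data[i]);
//   // sum now holds the total with reduced rounding error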
632 
633 #if defined(__ARM_FEATURE_CRYPTO)
634 // Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
636 {
637     poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
638     poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
639     return vreinterpretq_u64_p128(vmull_p64(a, b));
640 }
641 #else  // ARMv7 polyfill
642 // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
643 //
644 // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
645 // 64-bit->128-bit polynomial multiply.
646 //
647 // It needs some work and is somewhat slow, but it is still faster than all
648 // known scalar methods.
649 //
650 // Algorithm adapted to C from
651 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
652 // from "Fast Software Polynomial Multiplication on ARM Processors Using the
653 // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
654 // (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
656 {
657     poly8x8_t a = vreinterpret_p8_u64(_a);
658     poly8x8_t b = vreinterpret_p8_u64(_b);
659 
660     // Masks
661     uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
662                                     vcreate_u8(0x00000000ffffffff));
663     uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
664                                     vcreate_u8(0x0000000000000000));
665 
666     // Do the multiplies, rotating with vext to get all combinations
667     uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
668     uint8x16_t e =
669         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
670     uint8x16_t f =
671         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
672     uint8x16_t g =
673         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
674     uint8x16_t h =
675         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
676     uint8x16_t i =
677         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
678     uint8x16_t j =
679         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
680     uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4
682 
683     // Add cross products
684     uint8x16_t l = veorq_u8(e, f);  // L = E + F
685     uint8x16_t m = veorq_u8(g, h);  // M = G + H
686     uint8x16_t n = veorq_u8(i, j);  // N = I + J
687 
688     // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
689     // instructions.
690 #if defined(__aarch64__)
691     uint8x16_t lm_p0 = vreinterpretq_u8_u64(
692         vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
693     uint8x16_t lm_p1 = vreinterpretq_u8_u64(
694         vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
695     uint8x16_t nk_p0 = vreinterpretq_u8_u64(
696         vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
697     uint8x16_t nk_p1 = vreinterpretq_u8_u64(
698         vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
699 #else
700     uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
701     uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
702     uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
703     uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
704 #endif
705     // t0 = (L) (P0 + P1) << 8
706     // t1 = (M) (P2 + P3) << 16
707     uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
708     uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
709     uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
710 
711     // t2 = (N) (P4 + P5) << 24
712     // t3 = (K) (P6 + P7) << 32
713     uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
714     uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
715     uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
716 
717     // De-interleave
718 #if defined(__aarch64__)
719     uint8x16_t t0 = vreinterpretq_u8_u64(
720         vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
721     uint8x16_t t1 = vreinterpretq_u8_u64(
722         vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
723     uint8x16_t t2 = vreinterpretq_u8_u64(
724         vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
725     uint8x16_t t3 = vreinterpretq_u8_u64(
726         vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
727 #else
728     uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
729     uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
730     uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
731     uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
732 #endif
733     // Shift the cross products
734     uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
735     uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
736     uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
737     uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
738 
739     // Accumulate the products
740     uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
741     uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
742     uint8x16_t mix = veorq_u8(d, cross1);
743     uint8x16_t r = veorq_u8(mix, cross2);
744     return vreinterpretq_u64_u8(r);
745 }
746 #endif  // ARMv7 polyfill
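// Example (illustrative): carry-less multiply of the low 64-bit halves of two
// __m128i values, the building block of a pclmul-style operation:
//   uint64x1_t a_lo = vget_low_u64(vreinterpretq_u64_m128i(a));
//   uint64x1_t b_lo = vget_low_u64(vreinterpretq_u64_m128i(b));
//   uint64x2_t prod = _sse2neon_vmull_p64(a_lo, b_lo);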
747 
748 // C equivalent:
749 //   __m128i _mm_shuffle_epi32_default(__m128i a,
750 //                                     __constrange(0, 255) int imm) {
751 //       __m128i ret;
752 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
753 //       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
754 //       return ret;
755 //   }
756 #define _mm_shuffle_epi32_default(a, imm)                                   \
757     __extension__({                                                         \
758         int32x4_t ret;                                                      \
759         ret = vmovq_n_s32(                                                  \
760             vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
761         ret = vsetq_lane_s32(                                               \
762             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
763             ret, 1);                                                        \
764         ret = vsetq_lane_s32(                                               \
765             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
766             ret, 2);                                                        \
767         ret = vsetq_lane_s32(                                               \
768             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
769             ret, 3);                                                        \
770         vreinterpretq_m128i_s32(ret);                                       \
771     })
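// Example (illustrative): _mm_shuffle_epi32_default(a, _MM_SHUFFLE(0, 1, 2, 3))
// reverses the four 32-bit lanes, i.e. the result is {a[3], a[2], a[1], a[0]}.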
772 
773 // Takes the upper 64 bits of a and places it in the low end of the result
774 // Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
776 {
777     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
778     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
779     return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
780 }
781 
// takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the higher two 32-bit values from a, swaps
// them, and places them in the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
786 {
787     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
788     int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
789     return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
790 }
791 
792 // rotates the least significant 32 bits into the most significant 32 bits, and
793 // shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
795 {
796     return vreinterpretq_m128i_s32(
797         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
798 }
799 
800 // rotates the most significant 32 bits into the least significant 32 bits, and
801 // shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
803 {
804     return vreinterpretq_m128i_s32(
805         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
806 }
807 
808 // gets the lower 64 bits of a, and places it in the upper 64 bits
809 // gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
811 {
812     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
813     return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
814 }
815 
// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
// the lower 64 bits; gets the lower 64 bits of a and places them in the upper
// 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
819 {
820     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
821     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
822     return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
823 }
824 
// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
// the upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements,
// and places them in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
829 {
830     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
831     return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
832 }
833 
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
835 {
836     int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
837     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
838     return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
839 }
840 
FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
842 {
843     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
844     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
845     return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
846 }
847 
FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
849 {
850     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
851     int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
852     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
853 }
854 
855 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
856 // int imm)
857 #if defined(__aarch64__)
858 #define _mm_shuffle_epi32_splat(a, imm)                          \
859     __extension__({                                              \
860         vreinterpretq_m128i_s32(                                 \
861             vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
862     })
863 #else
864 #define _mm_shuffle_epi32_splat(a, imm)                                      \
865     __extension__({                                                          \
866         vreinterpretq_m128i_s32(                                             \
867             vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
868     })
869 #endif
870 
871 // NEON does not support a general purpose permute intrinsic
872 // Selects four specific single-precision, floating-point values from a and b,
873 // based on the mask i.
874 //
875 // C equivalent:
876 //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
877 //                                 __constrange(0, 255) int imm) {
878 //       __m128 ret;
879 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
880 //       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
881 //       return ret;
882 //   }
883 //
884 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
885 #define _mm_shuffle_ps_default(a, b, imm)                                  \
886     __extension__({                                                        \
887         float32x4_t ret;                                                   \
888         ret = vmovq_n_f32(                                                 \
889             vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
890         ret = vsetq_lane_f32(                                              \
891             vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
892             ret, 1);                                                       \
893         ret = vsetq_lane_f32(                                              \
894             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
895             ret, 2);                                                       \
896         ret = vsetq_lane_f32(                                              \
897             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
898             ret, 3);                                                       \
899         vreinterpretq_m128_f32(ret);                                       \
900     })
901 
902 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
903 // by imm.
904 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
905 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
906 //                                                   __constrange(0,255) int
907 //                                                   imm)
908 #define _mm_shufflelo_epi16_function(a, imm)                                  \
909     __extension__({                                                           \
910         int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
911         int16x4_t lowBits = vget_low_s16(ret);                                \
912         ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
913         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
914                              1);                                              \
915         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
916                              2);                                              \
917         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
918                              3);                                              \
919         vreinterpretq_m128i_s16(ret);                                         \
920     })
921 
922 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
923 // by imm.
924 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
925 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
926 //                                                   __constrange(0,255) int
927 //                                                   imm)
928 #define _mm_shufflehi_epi16_function(a, imm)                                   \
929     __extension__({                                                            \
930         int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
931         int16x4_t highBits = vget_high_s16(ret);                               \
932         ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
933         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
934                              5);                                               \
935         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
936                              6);                                               \
937         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
938                              7);                                               \
939         vreinterpretq_m128i_s16(ret);                                          \
940     })
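// Example (illustrative):
// _mm_shufflelo_epi16_function(a, _MM_SHUFFLE(0, 1, 2, 3)) reverses the four
// low 16-bit lanes and leaves the high four untouched:
//   a = {0, 1, 2, 3, 4, 5, 6, 7}  ->  {3, 2, 1, 0, 4, 5, 6, 7}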
941 
942 /* SSE */
943 
944 // Adds the four single-precision, floating-point values of a and b.
945 //
946 //   r0 := a0 + b0
947 //   r1 := a1 + b1
948 //   r2 := a2 + b2
949 //   r3 := a3 + b3
950 //
951 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
953 {
954     return vreinterpretq_m128_f32(
955         vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
956 }
957 
958 // adds the scalar single-precision floating point values of a and b.
959 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
961 {
962     float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
963     float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
964     // the upper values in the result must be the remnants of <a>.
965     return vreinterpretq_m128_f32(vaddq_f32(a, value));
966 }
967 
968 // Computes the bitwise AND of the four single-precision, floating-point values
969 // of a and b.
970 //
971 //   r0 := a0 & b0
972 //   r1 := a1 & b1
973 //   r2 := a2 & b2
974 //   r3 := a3 & b3
975 //
976 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
978 {
979     return vreinterpretq_m128_s32(
980         vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
981 }
982 
983 // Computes the bitwise AND-NOT of the four single-precision, floating-point
984 // values of a and b.
985 //
986 //   r0 := ~a0 & b0
987 //   r1 := ~a1 & b1
988 //   r2 := ~a2 & b2
989 //   r3 := ~a3 & b3
990 //
991 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
993 {
994     return vreinterpretq_m128_s32(
995         vbicq_s32(vreinterpretq_s32_m128(b),
996                   vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
997 }
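// Example (illustrative): with the sign-bit mask m = _mm_set1_ps(-0.0f),
// _mm_andnot_ps(m, x) clears the sign bit of every lane, i.e. it computes |x|
// lane-wise.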
998 
999 // Average packed unsigned 16-bit integers in a and b, and store the results in
1000 // dst.
1001 //
1002 //   FOR j := 0 to 3
1003 //     i := j*16
1004 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
1005 //   ENDFOR
1006 //
1007 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1009 {
1010     return vreinterpret_m64_u16(
1011         vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1012 }
1013 
1014 // Average packed unsigned 8-bit integers in a and b, and store the results in
1015 // dst.
1016 //
1017 //   FOR j := 0 to 7
1018 //     i := j*8
1019 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
1020 //   ENDFOR
1021 //
1022 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1024 {
1025     return vreinterpret_m64_u8(
1026         vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1027 }
1028 
1029 // Compares for equality.
1030 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
1032 {
1033     return vreinterpretq_m128_u32(
1034         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1035 }
1036 
1037 // Compares for equality.
1038 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
1040 {
1041     return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1042 }
1043 
1044 // Compares for greater than or equal.
1045 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
1047 {
1048     return vreinterpretq_m128_u32(
1049         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1050 }
1051 
1052 // Compares for greater than or equal.
1053 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
1055 {
1056     return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1057 }
1058 
1059 // Compares for greater than.
1060 //
1061 //   r0 := (a0 > b0) ? 0xffffffff : 0x0
1062 //   r1 := (a1 > b1) ? 0xffffffff : 0x0
1063 //   r2 := (a2 > b2) ? 0xffffffff : 0x0
1064 //   r3 := (a3 > b3) ? 0xffffffff : 0x0
1065 //
1066 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
1068 {
1069     return vreinterpretq_m128_u32(
1070         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1071 }
1072 
1073 // Compares for greater than.
1074 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
1076 {
1077     return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1078 }
1079 
1080 // Compares for less than or equal.
1081 //
1082 //   r0 := (a0 <= b0) ? 0xffffffff : 0x0
1083 //   r1 := (a1 <= b1) ? 0xffffffff : 0x0
1084 //   r2 := (a2 <= b2) ? 0xffffffff : 0x0
1085 //   r3 := (a3 <= b3) ? 0xffffffff : 0x0
1086 //
1087 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
1089 {
1090     return vreinterpretq_m128_u32(
1091         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1092 }
1093 
1094 // Compares for less than or equal.
1095 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
1097 {
1098     return _mm_move_ss(a, _mm_cmple_ps(a, b));
1099 }
1100 
1101 // Compares for less than
1102 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
1104 {
1105     return vreinterpretq_m128_u32(
1106         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1107 }
1108 
1109 // Compares for less than
1110 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
1112 {
1113     return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1114 }
1115 
1116 // Compares for inequality.
1117 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
1119 {
1120     return vreinterpretq_m128_u32(vmvnq_u32(
1121         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1122 }
1123 
1124 // Compares for inequality.
1125 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
1127 {
1128     return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1129 }
1130 
1131 // Compares for not greater than or equal.
1132 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
1134 {
1135     return _mm_cmplt_ps(a, b);
1136 }
1137 
1138 // Compares for not greater than or equal.
1139 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
1141 {
1142     return _mm_cmplt_ss(a, b);
1143 }
1144 
1145 // Compares for not greater than.
1146 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
1148 {
1149     return _mm_cmple_ps(a, b);
1150 }
1151 
1152 // Compares for not greater than.
1153 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
1155 {
1156     return _mm_cmple_ss(a, b);
1157 }
1158 
1159 // Compares for not less than or equal.
1160 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
1162 {
1163     return _mm_cmpgt_ps(a, b);
1164 }
1165 
1166 // Compares for not less than or equal.
1167 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
1169 {
1170     return _mm_cmpgt_ss(a, b);
1171 }
1172 
1173 // Compares for not less than.
1174 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
1176 {
1177     return _mm_cmpge_ps(a, b);
1178 }
1179 
1180 // Compares for not less than.
1181 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
1183 {
1184     return _mm_cmpge_ss(a, b);
1185 }
1186 
1187 // Compares the four 32-bit floats in a and b to check if any values are NaN.
1188 // Ordered compare between each value returns true for "orderable" and false for
1189 // "not orderable" (NaN).
1190 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
1191 // also:
1192 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1193 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1195 {
1196     // Note: NEON does not have ordered compare builtin
1197     // Need to compare a eq a and b eq b to check for NaN
1198     // Do AND of results to get final
1199     uint32x4_t ceqaa =
1200         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1201     uint32x4_t ceqbb =
1202         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1203     return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1204 }
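// Example (illustrative): for a = {1.0f, NAN, 3.0f, 4.0f} and
// b = {1.0f, 2.0f, NAN, 4.0f}, _mm_cmpord_ps(a, b) yields
// {0xFFFFFFFF, 0, 0, 0xFFFFFFFF}; a lane is "ordered" only when neither input
// is NaN.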
1205 
1206 // Compares for ordered.
1207 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
1209 {
1210     return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1211 }
1212 
1213 // Compares for unordered.
1214 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
1216 {
1217     uint32x4_t f32a =
1218         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1219     uint32x4_t f32b =
1220         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1221     return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1222 }
1223 
1224 // Compares for unordered.
1225 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
_mm_cmpunord_ss(__m128 a,__m128 b)1226 FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
1227 {
1228     return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1229 }
1230 
1231 // Compares the lower single-precision floating point scalar values of a and b
1232 // using an equality operation. :
1233 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
_mm_comieq_ss(__m128 a,__m128 b)1234 FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1235 {
1236     // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
1237     // vreinterpretq_f32_m128(b)), 0);
1238     uint32x4_t a_not_nan =
1239         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1240     uint32x4_t b_not_nan =
1241         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1242     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1243     uint32x4_t a_eq_b =
1244         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1245     return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) & 0x1;
1246 }
1247 
1248 // Compares the lower single-precision floating point scalar values of a and b
1249 // using a greater than or equal operation. :
1250 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
_mm_comige_ss(__m128 a,__m128 b)1251 FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1252 {
1253     // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
1254     // vreinterpretq_f32_m128(b)), 0);
1255     uint32x4_t a_not_nan =
1256         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1257     uint32x4_t b_not_nan =
1258         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1259     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1260     uint32x4_t a_ge_b =
1261         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1262     return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) & 0x1;
1263 }
1264 
1265 // Compares the lower single-precision floating point scalar values of a and b
1266 // using a greater than operation. :
1267 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
_mm_comigt_ss(__m128 a,__m128 b)1268 FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1269 {
1270     // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
1271     // vreinterpretq_f32_m128(b)), 0);
1272     uint32x4_t a_not_nan =
1273         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1274     uint32x4_t b_not_nan =
1275         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1276     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1277     uint32x4_t a_gt_b =
1278         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1279     return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) & 0x1;
1280 }
1281 
1282 // Compares the lower single-precision floating point scalar values of a and b
1283 // using a less than or equal operation. :
1284 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
_mm_comile_ss(__m128 a,__m128 b)1285 FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1286 {
1287     // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
1288     // vreinterpretq_f32_m128(b)), 0);
1289     uint32x4_t a_not_nan =
1290         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1291     uint32x4_t b_not_nan =
1292         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1293     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1294     uint32x4_t a_le_b =
1295         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1296     return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) & 0x1;
1297 }
1298 
1299 // Compares the lower single-precision floating point scalar values of a and b
1300 // using a less than operation. :
1301 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
1302 // note!! The documentation on MSDN is incorrect!  If either of the values is a
1303 // NAN the docs say you will get a one, but in fact, it will return a zero!!
_mm_comilt_ss(__m128 a,__m128 b)1304 FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1305 {
1306     uint32x4_t a_not_nan =
1307         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1308     uint32x4_t b_not_nan =
1309         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1310     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1311     uint32x4_t a_lt_b =
1312         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1313     return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) & 0x1;
1314 }
1315 
1316 // Compares the lower single-precision floating point scalar values of a and b
1317 // using an inequality operation. :
1318 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
_mm_comineq_ss(__m128 a,__m128 b)1319 FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1320 {
1321     // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
1322     // vreinterpretq_f32_m128(b)), 0);
1323     uint32x4_t a_not_nan =
1324         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1325     uint32x4_t b_not_nan =
1326         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1327     uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1328     uint32x4_t a_neq_b = vmvnq_u32(
1329         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1330     return vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) & 0x1;
1331 }
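
// Usage sketch (illustrative): unlike the _mm_cmp*_ss family, the
// _mm_comi*_ss comparisons above return a plain int (0 or 1) computed from
// the lowest lane only, and the ordered forms report 0 when either low lane
// is NaN, e.g.
//
//   __m128 p = _mm_set_ss(1.0f);
//   __m128 q = _mm_set_ss(2.0f);
//   int lt = _mm_comilt_ss(p, q);  // 1, since 1.0f < 2.0f
//   int ge = _mm_comige_ss(p, q);  // 0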

// Convert packed signed 32-bit integers in b to packed single-precision
// (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, and copy the upper 2 packed elements from a to the upper elements of
// dst.
//
//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
//   dst[95:64] := a[95:64]
//   dst[127:96] := a[127:96]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
                     vget_high_f32(vreinterpretq_f32_m128(a))));
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
//
//   FOR j := 0 to 1
//       i := 32*j
//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
{
#if defined(__aarch64__)
    return vreinterpret_m64_s32(
        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
#else
    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
#endif
}
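
// Usage sketch (illustrative): _mm_cvt_ps2pi honors the current rounding
// mode, so with the default round-to-nearest(-even) setting
//
//   __m64 r = _mm_cvt_ps2pi(_mm_set_ps(9.0f, 9.0f, 2.5f, 1.5f));
//   // both low lanes of r are 2: 1.5f and 2.5f round to the nearest even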

// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
//
//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
//   dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
{
#if defined(__aarch64__)
    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                          0);
#else
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int32_t) data;
#endif
}

// Convert packed 16-bit integers in a to packed single-precision (32-bit)
// floating-point elements, and store the results in dst.
//
//   FOR j := 0 to 3
//      i := j*16
//      m := j*32
//      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
{
    return vreinterpretq_m128_f32(
        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
}

// Convert packed 32-bit integers in b to packed single-precision (32-bit)
// floating-point elements, store the results in the lower 2 elements of dst,
// and copy the upper 2 packed elements from a to the upper elements of dst.
//
//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
//   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
//   dst[95:64] := a[95:64]
//   dst[127:96] := a[127:96]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
                     vget_high_f32(vreinterpretq_f32_m128(a))));
}

// Convert packed signed 32-bit integers in a to packed single-precision
// (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, then convert the packed signed 32-bit integers in b to
// single-precision (32-bit) floating-point elements, and store the results in
// the upper 2 elements of dst.
//
//   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
//   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
//   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
//   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(
        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
}

// Convert the lower packed 8-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
//
//   FOR j := 0 to 3
//      i := j*8
//      m := j*32
//      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(
        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 16-bit integers, and store the results in dst. Note: this intrinsic
// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
// 0x7FFFFFFF.
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
{
    return vreinterpret_m64_s16(
        vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
//
//   FOR j := 0 to 1
//       i := 32*j
//       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)

// Convert packed unsigned 16-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
//
//   FOR j := 0 to 3
//      i := j*16
//      m := j*32
//      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
{
    return vreinterpretq_m128_f32(
        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
}

// Convert the lower packed unsigned 8-bit integers in a to packed
// single-precision (32-bit) floating-point elements, and store the results in
// dst.
//
//   FOR j := 0 to 3
//      i := j*8
//      m := j*32
//      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_u32(
        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
}

// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
//
//   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
//   dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)

// Convert the signed 64-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
//
//   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
//   dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}

// Copy the lower single-precision (32-bit) floating-point element of a to dst.
//
//   dst[31:0] := a[31:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
{
    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
//
//   dst[31:0] := Convert_FP32_To_Int32(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
//
//   dst[63:0] := Convert_FP32_To_Int64(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
{
#if defined(__aarch64__)
    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
#else
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int64_t) data;
#endif
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
//
//   FOR j := 0 to 1
//      i := 32*j
//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
{
    return vreinterpret_m64_s32(
        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
//
//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
{
    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
//
//   FOR j := 0 to 1
//      i := 32*j
//      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
//
//   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
//
//   dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
{
    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}
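
// Usage sketch (illustrative): the _mm_cvtt* forms truncate toward zero,
// while the _mm_cvt* forms honor the current rounding mode, e.g.
//
//   __m128 v = _mm_set_ss(-1.7f);
//   int t = _mm_cvtt_ss2si(v);  // -1 (truncated)
//   int r = _mm_cvt_ss2si(v);   // -2 under round-to-nearest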

// Divides the four single-precision, floating-point values of a and b.
//
//   r0 := a0 / b0
//   r1 := a1 / b1
//   r2 := a2 / b2
//   r3 := a3 / b3
//
// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
    return vreinterpretq_m128_f32(
        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
#if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
#endif
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
#endif
}
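
// Usage note (illustrative): with the default configuration the AArch64 path
// above uses the exact vdivq_f32 instruction, while the 32-bit Arm path
// refines a vrecpeq_f32 estimate with vrecpsq_f32 Newton-Raphson steps.
// Defining
//
//   #define SSE2NEON_PRECISE_DIV 1   /* before including sse2neon.h */
//
// enables the extra refinement step shown above for results closer to x86.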

// Divides the scalar single-precision floating point value of a by b.
// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
#define _mm_extract_pi16(a, imm) \
    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))

// Free aligned memory that was allocated with _mm_malloc.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
FORCE_INLINE void _mm_free(void *addr)
{
    free(addr);
}

// Macro: Get the rounding mode bits from the MXCSR control and status register.
// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    if (r.field.bit22) {
        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
    } else {
        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
    }
}
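
// For reference, the two rounding-mode bits read above map to the SSE modes
// as follows:
//
//   bit23 bit22   mode
//     0     0     _MM_ROUND_NEAREST
//     0     1     _MM_ROUND_UP
//     1     0     _MM_ROUND_DOWN
//     1     1     _MM_ROUND_TOWARD_ZERO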

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
#define _mm_insert_pi16(a, b, imm)                               \
    __extension__({                                              \
        vreinterpret_m64_s16(                                    \
            vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
    })

// Loads four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Load a single-precision (32-bit) floating-point element from memory into all
// elements of dst.
//
//   dst[31:0] := MEM[mem_addr+31:mem_addr]
//   dst[63:32] := MEM[mem_addr+31:mem_addr]
//   dst[95:64] := MEM[mem_addr+31:mem_addr]
//   dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps

// Loads a single-precision, floating-point value into the low word and clears
// the upper three words.
// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
{
    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
}

// Loads a single single-precision, floating-point value, copying it into all
// four words.
// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
}

// Sets the upper two single-precision, floating-point values with 64
// bits of data loaded from the address p; the lower two values are passed
// through from a.
//
//   r0 := a0
//   r1 := a1
//   r2 := *p0
//   r3 := *p1
//
// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
}

// Sets the lower two single-precision, floating-point values with 64
// bits of data loaded from the address p; the upper two values are passed
// through from a.
//
// Return Value
//   r0 := *p0
//   r1 := *p1
//   r2 := a2
//   r3 := a3
//
// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
}

// Load 4 single-precision (32-bit) floating-point elements from memory into dst
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
//   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
//   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
//   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
//   dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
{
    float32x4_t v = vrev64q_f32(vld1q_f32(p));
    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
}
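
// Usage sketch (illustrative): given a 16-byte aligned buffer
// float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f}, _mm_loadr_ps(buf) produces the
// same vector as _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f), i.e. lane 0 holds 4.0f.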

// Loads four single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
{
    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
    // equivalent for neon
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Load unaligned 16-bit integer from memory into the first element of dst.
//
//   dst[15:0] := MEM[mem_addr+15:mem_addr]
//   dst[MAX:16] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
{
    return vreinterpretq_m128i_s16(
        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
}

// Load unaligned 64-bit integer from memory into the first element of dst.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[MAX:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
{
    return vreinterpretq_m128i_s64(
        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
}

// Allocate aligned blocks of memory.
// https://software.intel.com/en-us/
//         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
{
    void *ptr;
    if (align == 1)
        return malloc(size);
    if (align == 2 || (sizeof(void *) == 8 && align == 4))
        align = sizeof(void *);
    if (!posix_memalign(&ptr, align, size))
        return ptr;
    return NULL;
}
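
// Usage sketch (illustrative): memory obtained here must be released with
// _mm_free, e.g.
//
//   float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
//   if (buf) {
//       /* ... use buf with _mm_load_ps / _mm_store_ps ... */
//       _mm_free(buf);
//   }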

// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
{
    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
    __m128 b = _mm_load_ps((const float *) mem_addr);
    int8x8_t masked =
        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
    vst1_s8((int8_t *) mem_addr, masked);
}
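
// Usage sketch (illustrative; assumes _mm_set_pi8 from elsewhere in this
// header): only bytes whose mask byte has its top bit set are written, e.g.
//
//   char buf[16] = {0};
//   __m64 data = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
//   __m64 mask = _mm_set_pi8(0, 0, 0, 0, -1, -1, -1, -1);
//   _mm_maskmove_si64(data, mask, buf);
//   // buf[0..7] is now {1, 2, 3, 4, 0, 0, 0, 0}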

// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)

// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
//
//   FOR j := 0 to 3
//      i := j*16
//      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
{
    return vreinterpret_m64_s16(
        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}

// Computes the maximums of the four single-precision, floating-point values of
// a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
    float32x4_t _a = vreinterpretq_f32_m128(a);
    float32x4_t _b = vreinterpretq_f32_m128(b);
    return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
#else
    return vreinterpretq_m128_f32(
        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}
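
// Usage note (illustrative): x86 max/min return the second operand when
// either input is NaN, whereas vmaxq_f32 propagates NaN. Defining
//
//   #define SSE2NEON_PRECISE_MINMAX 1   /* before including sse2neon.h */
//
// selects the vbslq_f32/vcltq_f32 path above, which reproduces the x86
// operand-ordering behavior for _mm_max_ps and _mm_min_ps.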

// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
//
//   FOR j := 0 to 7
//      i := j*8
//      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Computes the maximum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
//
//   FOR j := 0 to 3
//      i := j*16
//      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
{
    return vreinterpret_m64_s16(
        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}

// Computes the minima of the four single-precision, floating-point values of a
// and b.
// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
    float32x4_t _a = vreinterpretq_f32_m128(a);
    float32x4_t _b = vreinterpretq_f32_m128(b);
    return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
#else
    return vreinterpretq_m128_f32(
        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
//
//   FOR j := 0 to 7
//      i := j*8
//      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Computes the minimum of the two lower scalar single-precision floating point
// values of a and b.
// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Sets the low word to the single-precision, floating-point value of b
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
                       vreinterpretq_f32_m128(a), 0));
}

// Moves the upper two values of B into the lower two values of A.
//
//   r3 := a3
//   r2 := a2
//   r1 := b3
//   r0 := b2
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
}

// Moves the lower two values of B into the upper two values of A.
//
//   r3 := b1
//   r2 := b0
//   r1 := a1
//   r0 := a0
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}
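
// Usage sketch (illustrative):
//
//   __m128 lo = _mm_set_ps(0.0f, 0.0f, 2.0f, 1.0f);  // lanes {1, 2, 0, 0}
//   __m128 hi = _mm_set_ps(0.0f, 0.0f, 4.0f, 3.0f);  // lanes {3, 4, 0, 0}
//   __m128 v = _mm_movelh_ps(lo, hi);                // lanes {1, 2, 3, 4}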

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
{
    uint8x8_t input = vreinterpret_u8_m64(a);
#if defined(__aarch64__)
    static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
    uint8x8_t tmp = vshr_n_u8(input, 7);
    return vaddv_u8(vshl_u8(tmp, shift));
#else
    // Refer to the implementation of `_mm_movemask_epi8`
    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
    uint32x2_t paired16 =
        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
    uint8x8_t paired32 =
        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
#endif
}

// NEON does not provide this method
// Creates a 4-bit mask from the most significant bits of the four
// single-precision, floating-point values.
// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_ps(__m128 a)
{
    uint32x4_t input = vreinterpretq_u32_m128(a);
#if defined(__aarch64__)
    static const int32x4_t shift = {0, 1, 2, 3};
    uint32x4_t tmp = vshrq_n_u32(input, 31);
    return vaddvq_u32(vshlq_u32(tmp, shift));
#else
    // Uses the exact same method as _mm_movemask_epi8, see that for details.
    // Shift out everything but the sign bits with a 32-bit unsigned shift
    // right.
    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
    // Merge the two pairs together with a 64-bit unsigned shift right + add.
    uint8x16_t paired =
        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
    // Extract the result.
    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
#endif
}
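
// Usage sketch (illustrative): each bit of the result is the sign bit of the
// corresponding lane, e.g.
//
//   __m128 v = _mm_set_ps(-4.0f, 3.0f, -2.0f, 1.0f);  // lanes {1, -2, 3, -4}
//   int m = _mm_movemask_ps(v);                       // 0xA (lanes 1 and 3)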

// Multiplies the four single-precision, floating-point values of a and b.
//
//   r0 := a0 * b0
//   r1 := a1 * b1
//   r2 := a2 * b2
//   r3 := a3 * b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Multiply the lower single-precision (32-bit) floating-point element in a and
// b, store the result in the lower element of dst, and copy the upper 3 packed
// elements from a to the upper elements of dst.
//
//   dst[31:0] := a[31:0] * b[31:0]
//   dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_mul_ps(a, b));
}

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
{
    return vreinterpret_m64_u16(vshrn_n_u32(
        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
}

// Computes the bitwise OR of the four single-precision, floating-point values
// of a and b.
// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 7
//     i := j*8
//     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
#define _m_pavgb(a, b) _mm_avg_pu8(a, b)

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
//
//   FOR j := 0 to 3
//     i := j*16
//     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
#define _m_pavgw(a, b) _mm_avg_pu16(a, b)

// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)

// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)

// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)

// Loads one cache line of data from address p to a location closer to the
// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
FORCE_INLINE void _mm_prefetch(const void *p, int i)
{
    (void) i;
    __builtin_prefetch(p);
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)

// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)

// Compute the approximate reciprocal of packed single-precision (32-bit)
// floating-point elements in a, and store the results in dst. The maximum
// relative error for this approximation is less than 1.5*2^-12.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
#endif
    return vreinterpretq_m128_f32(recip);
}

// Compute the approximate reciprocal of the lower single-precision (32-bit)
// floating-point element in a, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
//
//   dst[31:0] := (1.0 / a[31:0])
//   dst[127:32] := a[127:32]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
{
    return _mm_move_ss(a, _mm_rcp_ps(a));
}

// Computes the approximations of the reciprocal square roots of the four
// single-precision floating point values of in.
// The current precision is 1% error.
// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
#if SSE2NEON_PRECISE_SQRT
    // Additional Newton-Raphson iterations for accuracy
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
#endif
    return vreinterpretq_m128_f32(out);
}

// Compute the approximate reciprocal square root of the lower single-precision
// (32-bit) floating-point element in a, store the result in the lower element
// of dst, and copy the upper 3 packed elements from a to the upper elements of
// dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
{
    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
{
    uint64x1_t t = vpaddl_u32(vpaddl_u16(
        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
    return vreinterpret_m64_u16(
        vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
}
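
// Usage sketch (illustrative; assumes _mm_set_pi8 from elsewhere in this
// header):
//
//   __m64 a = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
//   __m64 b = _mm_set_pi8(0, 0, 0, 0, 0, 0, 0, 0);
//   __m64 s = _mm_sad_pu8(a, b);
//   // the low 16 bits of s hold 1 + 2 + ... + 8 == 36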

// Sets the four single-precision, floating-point values to the four inputs.
// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
{
    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

// Sets the four single-precision, floating-point values to w.
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}

// Macro: Set the rounding mode bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The rounding mode may contain any of
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__)
    asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
#else
    asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    switch (rounding) {
    case _MM_ROUND_TOWARD_ZERO:
        r.field.bit22 = 1;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_DOWN:
        r.field.bit22 = 0;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_UP:
        r.field.bit22 = 1;
        r.field.bit23 = 0;
        break;
    default:  //_MM_ROUND_NEAREST
        r.field.bit22 = 0;
        r.field.bit23 = 0;
    }

#if defined(__aarch64__)
    asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
#else
    asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

// Copy single-precision (32-bit) floating-point element a to the lower element
// of dst, and zero the upper 3 elements.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
FORCE_INLINE __m128 _mm_set_ss(float a)
{
    float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

// Sets the four single-precision, floating-point values to w.
//
//   r0 := r1 := r2 := r3 := w
//
// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set1_ps(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}

// Set the MXCSR control and status register with the value in unsigned 32-bit
// integer a. Note: this implementation only forwards the rounding-mode bits to
// _MM_SET_ROUNDING_MODE.
FORCE_INLINE void _mm_setcsr(unsigned int a)
{
    _MM_SET_ROUNDING_MODE(a);
}

// Sets the four single-precision, floating-point values to the four inputs in
// reverse order.
// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
{
    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}
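
// Usage sketch (illustrative): _mm_setr_ps takes its arguments in memory
// (lane) order, so
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
//
// produce identical vectors, with 1.0f in lane 0 of both.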
2355 
2356 // Clears the four single-precision, floating-point values.
2357 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
_mm_setzero_ps(void)2358 FORCE_INLINE __m128 _mm_setzero_ps(void)
2359 {
2360     return vreinterpretq_m128_f32(vdupq_n_f32(0));
2361 }
2362 
2363 // Shuffle 16-bit integers in a using the control in imm8, and store the results
2364 // in dst.
2365 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
2366 #if __has_builtin(__builtin_shufflevector)
2367 #define _mm_shuffle_pi16(a, imm)                                           \
2368     __extension__({                                                        \
2369         vreinterpret_m64_s16(__builtin_shufflevector(                      \
2370             vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2371             ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)));  \
2372     })
2373 #else
2374 #define _mm_shuffle_pi16(a, imm)                                               \
2375     __extension__({                                                            \
2376         int16x4_t ret;                                                         \
2377         ret =                                                                  \
2378             vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2379         ret = vset_lane_s16(                                                   \
2380             vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret,   \
2381             1);                                                                \
2382         ret = vset_lane_s16(                                                   \
2383             vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret,   \
2384             2);                                                                \
2385         ret = vset_lane_s16(                                                   \
2386             vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret,   \
2387             3);                                                                \
2388         vreinterpret_m64_s16(ret);                                             \
2389     })
2390 #endif
2391 
2392 // Guarantees that every preceding store is globally visible before any
2393 // subsequent store.
2394 // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
_mm_sfence(void)2395 FORCE_INLINE void _mm_sfence(void)
2396 {
2397     __sync_synchronize();
2398 }
2399 
2400 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
2401 // int imm)
2402 #if __has_builtin(__builtin_shufflevector)
2403 #define _mm_shuffle_ps(a, b, imm)                                \
2404     __extension__({                                              \
2405         float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
2406         float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
2407         float32x4_t _shuf = __builtin_shufflevector(             \
2408             _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2409             (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2410         vreinterpretq_m128_f32(_shuf);                           \
2411     })
2412 #else  // generic
2413 #define _mm_shuffle_ps(a, b, imm)                          \
2414     __extension__({                                        \
2415         __m128 ret;                                        \
2416         switch (imm) {                                     \
2417         case _MM_SHUFFLE(1, 0, 3, 2):                      \
2418             ret = _mm_shuffle_ps_1032((a), (b));           \
2419             break;                                         \
2420         case _MM_SHUFFLE(2, 3, 0, 1):                      \
2421             ret = _mm_shuffle_ps_2301((a), (b));           \
2422             break;                                         \
2423         case _MM_SHUFFLE(0, 3, 2, 1):                      \
2424             ret = _mm_shuffle_ps_0321((a), (b));           \
2425             break;                                         \
2426         case _MM_SHUFFLE(2, 1, 0, 3):                      \
2427             ret = _mm_shuffle_ps_2103((a), (b));           \
2428             break;                                         \
2429         case _MM_SHUFFLE(1, 0, 1, 0):                      \
2430             ret = _mm_movelh_ps((a), (b));                 \
2431             break;                                         \
2432         case _MM_SHUFFLE(1, 0, 0, 1):                      \
2433             ret = _mm_shuffle_ps_1001((a), (b));           \
2434             break;                                         \
2435         case _MM_SHUFFLE(0, 1, 0, 1):                      \
2436             ret = _mm_shuffle_ps_0101((a), (b));           \
2437             break;                                         \
2438         case _MM_SHUFFLE(3, 2, 1, 0):                      \
2439             ret = _mm_shuffle_ps_3210((a), (b));           \
2440             break;                                         \
2441         case _MM_SHUFFLE(0, 0, 1, 1):                      \
2442             ret = _mm_shuffle_ps_0011((a), (b));           \
2443             break;                                         \
2444         case _MM_SHUFFLE(0, 0, 2, 2):                      \
2445             ret = _mm_shuffle_ps_0022((a), (b));           \
2446             break;                                         \
2447         case _MM_SHUFFLE(2, 2, 0, 0):                      \
2448             ret = _mm_shuffle_ps_2200((a), (b));           \
2449             break;                                         \
2450         case _MM_SHUFFLE(3, 2, 0, 2):                      \
2451             ret = _mm_shuffle_ps_3202((a), (b));           \
2452             break;                                         \
2453         case _MM_SHUFFLE(3, 2, 3, 2):                      \
2454             ret = _mm_movehl_ps((b), (a));                 \
2455             break;                                         \
2456         case _MM_SHUFFLE(1, 1, 3, 3):                      \
2457             ret = _mm_shuffle_ps_1133((a), (b));           \
2458             break;                                         \
2459         case _MM_SHUFFLE(2, 0, 1, 0):                      \
2460             ret = _mm_shuffle_ps_2010((a), (b));           \
2461             break;                                         \
2462         case _MM_SHUFFLE(2, 0, 0, 1):                      \
2463             ret = _mm_shuffle_ps_2001((a), (b));           \
2464             break;                                         \
2465         case _MM_SHUFFLE(2, 0, 3, 2):                      \
2466             ret = _mm_shuffle_ps_2032((a), (b));           \
2467             break;                                         \
2468         default:                                           \
2469             ret = _mm_shuffle_ps_default((a), (b), (imm)); \
2470             break;                                         \
2471         }                                                  \
2472         ret;                                               \
2473     })
2474 #endif
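
// Illustrative usage of _mm_shuffle_ps with _MM_SHUFFLE (a minimal sketch;
// the variable names below are hypothetical):
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   // r = { a[2], a[1], b[0], b[3] } = { 3, 2, 5, 8 }
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 0, 1, 2));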
2475 
2476 // Computes the approximations of square roots of the four single-precision,
2477 // floating-point values of a. First computes reciprocal square roots and then
2478 // reciprocals of the four values.
2479 //
2480 //   r0 := sqrt(a0)
2481 //   r1 := sqrt(a1)
2482 //   r2 := sqrt(a2)
2483 //   r3 := sqrt(a3)
2484 //
2485 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
2486 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2487 {
2488 #if SSE2NEON_PRECISE_SQRT
2489     float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2490 
2491     // Test for vrsqrteq_f32(0) -> positive infinity case.
2492     // Change to zero, so that s * 1/sqrt(s) result is zero too.
2493     const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2494     const uint32x4_t div_by_zero =
2495         vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2496     recip = vreinterpretq_f32_u32(
2497         vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2498 
2499     // Additional Newton-Raphson iterations for accuracy
2500     recip = vmulq_f32(
2501         vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2502         recip);
2503     recip = vmulq_f32(
2504         vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2505         recip);
2506 
2507     // sqrt(s) = s * 1/sqrt(s)
2508     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2509 #elif defined(__aarch64__)
2510     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2511 #else
2512     float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2513     float32x4_t sq = vrecpeq_f32(recipsq);
2514     return vreinterpretq_m128_f32(sq);
2515 #endif
2516 }
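
// Math note on the SSE2NEON_PRECISE_SQRT path above: vrsqrtsq_f32(x, s)
// returns (3 - x * s) / 2, so each
//   recip = vrsqrtsq_f32(recip * recip, s) * recip
// step is one Newton-Raphson refinement of 1/sqrt(s):
//   x_{n+1} = x_n * (3 - s * x_n^2) / 2
// Two refinements of the vrsqrteq_f32 estimate are applied before the final
// multiply, since sqrt(s) = s * 1/sqrt(s).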
2517 
2518 // Computes the approximation of the square root of the scalar single-precision
2519 // floating point value of in.
2520 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
2521 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2522 {
2523     float32_t value =
2524         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2525     return vreinterpretq_m128_f32(
2526         vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2527 }
2528 
2529 // Stores four single-precision, floating-point values.
2530 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
2531 FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2532 {
2533     vst1q_f32(p, vreinterpretq_f32_m128(a));
2534 }
2535 
2536 // Store the lower single-precision (32-bit) floating-point element from a into
2537 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2538 // boundary or a general-protection exception may be generated.
2539 //
2540 //   MEM[mem_addr+31:mem_addr] := a[31:0]
2541 //   MEM[mem_addr+63:mem_addr+32] := a[31:0]
2542 //   MEM[mem_addr+95:mem_addr+64] := a[31:0]
2543 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
2544 //
2545 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
2546 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2547 {
2548     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2549     vst1q_f32(p, vdupq_n_f32(a0));
2550 }
2551 
2552 // Stores the lower single-precision, floating-point value.
2553 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
2554 FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2555 {
2556     vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2557 }
2558 
2559 // Store the lower single-precision (32-bit) floating-point element from a into
2560 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2561 // boundary or a general-protection exception may be generated.
2562 //
2563 //   MEM[mem_addr+31:mem_addr] := a[31:0]
2564 //   MEM[mem_addr+63:mem_addr+32] := a[31:0]
2565 //   MEM[mem_addr+95:mem_addr+64] := a[31:0]
2566 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
2567 //
2568 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
2569 #define _mm_store1_ps _mm_store_ps1
2570 
2571 // Stores the upper two single-precision, floating-point values of a to the
2572 // address p.
2573 //
2574 //   *p0 := a2
2575 //   *p1 := a3
2576 //
2577 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
2578 FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2579 {
2580     *p = vreinterpret_m64_f32(vget_high_f32(a));
2581 }
2582 
2583 // Stores the lower two single-precision floating point values of a to the
2584 // address p.
2585 //
2586 //   *p0 := a0
2587 //   *p1 := a1
2588 //
2589 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
2590 FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2591 {
2592     *p = vreinterpret_m64_f32(vget_low_f32(a));
2593 }
2594 
2595 // Store 4 single-precision (32-bit) floating-point elements from a into memory
2596 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2597 // general-protection exception may be generated.
2598 //
2599 //   MEM[mem_addr+31:mem_addr] := a[127:96]
2600 //   MEM[mem_addr+63:mem_addr+32] := a[95:64]
2601 //   MEM[mem_addr+95:mem_addr+64] := a[63:32]
2602 //   MEM[mem_addr+127:mem_addr+96] := a[31:0]
2603 //
2604 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
2605 FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2606 {
2607     float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2608     float32x4_t rev = vextq_f32(tmp, tmp, 2);
2609     vst1q_f32(p, rev);
2610 }
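
// Lane trace for the reversal above: with a = {a0, a1, a2, a3},
//   vrev64q_f32(a)         -> {a1, a0, a3, a2}   (swap within each 64-bit half)
//   vextq_f32(tmp, tmp, 2) -> {a3, a2, a1, a0}   (rotate the two halves)
// so the subsequent store writes the elements of a in reverse order.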
2611 
2612 // Stores four single-precision, floating-point values.
2613 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
2614 FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2615 {
2616     vst1q_f32(p, vreinterpretq_f32_m128(a));
2617 }
2618 
2619 // Stores 16-bits of integer data a at the address p.
2620 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
2621 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2622 {
2623     vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2624 }
2625 
2626 // Stores 64-bits of integer data a at the address p.
2627 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
2628 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2629 {
2630     vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2631 }
2632 
2633 // Store 64-bits of integer data from a into memory using a non-temporal memory
2634 // hint.
2635 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
2636 FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2637 {
2638     vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2639 }
2640 
2641 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2642 // point elements) from a into memory using a non-temporal memory hint.
2643 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
2644 FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2645 {
2646 #if __has_builtin(__builtin_nontemporal_store)
2647     __builtin_nontemporal_store(a, (float32x4_t *) p);
2648 #else
2649     vst1q_f32(p, vreinterpretq_f32_m128(a));
2650 #endif
2651 }
2652 
2653 // Subtracts the four single-precision, floating-point values of a and b.
2654 //
2655 //   r0 := a0 - b0
2656 //   r1 := a1 - b1
2657 //   r2 := a2 - b2
2658 //   r3 := a3 - b3
2659 //
2660 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2661 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2662 {
2663     return vreinterpretq_m128_f32(
2664         vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2665 }
2666 
2667 // Subtract the lower single-precision (32-bit) floating-point element in b from
2668 // the lower single-precision (32-bit) floating-point element in a, store the
2669 // result in the lower element of dst, and copy the upper 3 packed elements from
2670 // a to the upper elements of dst.
2671 //
2672 //   dst[31:0] := a[31:0] - b[31:0]
2673 //   dst[127:32] := a[127:32]
2674 //
2675 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2676 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2677 {
2678     return _mm_move_ss(a, _mm_sub_ps(a, b));
2679 }
2680 
2681 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2682 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2683 // transposed matrix in these vectors (row0 now contains column 0, etc.).
2684 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
2685 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
2686     do {                                                  \
2687         float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
2688         float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
2689         row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
2690                             vget_low_f32(ROW23.val[0]));  \
2691         row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
2692                             vget_low_f32(ROW23.val[1]));  \
2693         row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
2694                             vget_high_f32(ROW23.val[0])); \
2695         row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
2696                             vget_high_f32(ROW23.val[1])); \
2697     } while (0)
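
// Illustrative usage of _MM_TRANSPOSE4_PS (a minimal sketch; row0..row3 are
// hypothetical locals):
//
//   __m128 row0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 row1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   __m128 row2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
//   __m128 row3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
//   _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
//   // row0 = {1, 5, 9, 13}, row1 = {2, 6, 10, 14},
//   // row2 = {3, 7, 11, 15}, row3 = {4, 8, 12, 16}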
2698 
2699 // according to the documentation, these intrinsics behave the same as the
2700 // non-'u' versions.  We'll just alias them here.
2701 #define _mm_ucomieq_ss _mm_comieq_ss
2702 #define _mm_ucomige_ss _mm_comige_ss
2703 #define _mm_ucomigt_ss _mm_comigt_ss
2704 #define _mm_ucomile_ss _mm_comile_ss
2705 #define _mm_ucomilt_ss _mm_comilt_ss
2706 #define _mm_ucomineq_ss _mm_comineq_ss
2707 
2708 // Return vector of type __m128i with undefined elements.
2709 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
2710 FORCE_INLINE __m128i _mm_undefined_si128(void)
2711 {
2712 #if defined(__GNUC__) || defined(__clang__)
2713 #pragma GCC diagnostic push
2714 #pragma GCC diagnostic ignored "-Wuninitialized"
2715 #endif
2716     __m128i a;
2717     return a;
2718 #if defined(__GNUC__) || defined(__clang__)
2719 #pragma GCC diagnostic pop
2720 #endif
2721 }
2722 
2723 // Return vector of type __m128 with undefined elements.
2724 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
2725 FORCE_INLINE __m128 _mm_undefined_ps(void)
2726 {
2727 #if defined(__GNUC__) || defined(__clang__)
2728 #pragma GCC diagnostic push
2729 #pragma GCC diagnostic ignored "-Wuninitialized"
2730 #endif
2731     __m128 a;
2732     return a;
2733 #if defined(__GNUC__) || defined(__clang__)
2734 #pragma GCC diagnostic pop
2735 #endif
2736 }
2737 
2738 // Selects and interleaves the upper two single-precision, floating-point values
2739 // from a and b.
2740 //
2741 //   r0 := a2
2742 //   r1 := b2
2743 //   r2 := a3
2744 //   r3 := b3
2745 //
2746 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
2747 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2748 {
2749 #if defined(__aarch64__)
2750     return vreinterpretq_m128_f32(
2751         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2752 #else
2753     float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2754     float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2755     float32x2x2_t result = vzip_f32(a1, b1);
2756     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2757 #endif
2758 }
2759 
2760 // Selects and interleaves the lower two single-precision, floating-point values
2761 // from a and b.
2762 //
2763 //   r0 := a0
2764 //   r1 := b0
2765 //   r2 := a1
2766 //   r3 := b1
2767 //
2768 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
2769 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2770 {
2771 #if defined(__aarch64__)
2772     return vreinterpretq_m128_f32(
2773         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2774 #else
2775     float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2776     float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2777     float32x2x2_t result = vzip_f32(a1, b1);
2778     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2779 #endif
2780 }
2781 
2782 // Computes bitwise EXOR (exclusive-or) of the four single-precision,
2783 // floating-point values of a and b.
2784 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
2785 FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2786 {
2787     return vreinterpretq_m128_s32(
2788         veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2789 }
2790 
2791 /* SSE2 */
2792 
2793 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2794 // unsigned 16-bit integers in b.
2795 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2796 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2797 {
2798     return vreinterpretq_m128i_s16(
2799         vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2800 }
2801 
2802 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2803 // unsigned 32-bit integers in b.
2804 //
2805 //   r0 := a0 + b0
2806 //   r1 := a1 + b1
2807 //   r2 := a2 + b2
2808 //   r3 := a3 + b3
2809 //
2810 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2811 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2812 {
2813     return vreinterpretq_m128i_s32(
2814         vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2815 }
2816 
2817 // Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2818 // unsigned 64-bit integers in b.
2819 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2820 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2821 {
2822     return vreinterpretq_m128i_s64(
2823         vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2824 }
2825 
2826 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2827 // unsigned 8-bit integers in b.
2828 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
2829 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2830 {
2831     return vreinterpretq_m128i_s8(
2832         vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2833 }
2834 
2835 // Add packed double-precision (64-bit) floating-point elements in a and b, and
2836 // store the results in dst.
2837 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2838 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2839 {
2840 #if defined(__aarch64__)
2841     return vreinterpretq_m128d_f64(
2842         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2843 #else
2844     double *da = (double *) &a;
2845     double *db = (double *) &b;
2846     double c[2];
2847     c[0] = da[0] + db[0];
2848     c[1] = da[1] + db[1];
2849     return vld1q_f32((float32_t *) c);
2850 #endif
2851 }
2852 
2853 // Add the lower double-precision (64-bit) floating-point element in a and b,
2854 // store the result in the lower element of dst, and copy the upper element from
2855 // a to the upper element of dst.
2856 //
2857 //   dst[63:0] := a[63:0] + b[63:0]
2858 //   dst[127:64] := a[127:64]
2859 //
2860 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
2861 FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
2862 {
2863 #if defined(__aarch64__)
2864     return _mm_move_sd(a, _mm_add_pd(a, b));
2865 #else
2866     double *da = (double *) &a;
2867     double *db = (double *) &b;
2868     double c[2];
2869     c[0] = da[0] + db[0];
2870     c[1] = da[1];
2871     return vld1q_f32((float32_t *) c);
2872 #endif
2873 }
2874 
2875 // Add 64-bit integers a and b, and store the result in dst.
2876 //
2877 //   dst[63:0] := a[63:0] + b[63:0]
2878 //
2879 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
2880 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2881 {
2882     return vreinterpret_m64_s64(
2883         vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2884 }
2885 
2886 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2887 // and saturates.
2888 //
2889 //   r0 := SignedSaturate(a0 + b0)
2890 //   r1 := SignedSaturate(a1 + b1)
2891 //   ...
2892 //   r7 := SignedSaturate(a7 + b7)
2893 //
2894 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
2895 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2896 {
2897     return vreinterpretq_m128i_s16(
2898         vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2899 }
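
// Worked example for the saturating adds: with signed 16-bit saturation,
// 0x7FFF (32767) + 0x0001 yields 0x7FFF rather than wrapping to 0x8000, and
// 0x8000 (-32768) + 0xFFFF (-1) yields 0x8000, unlike _mm_add_epi16.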
2900 
2901 // Add packed signed 8-bit integers in a and b using saturation, and store the
2902 // results in dst.
2903 //
2904 //   FOR j := 0 to 15
2905 //     i := j*8
2906 //     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
2907 //   ENDFOR
2908 //
2909 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
2910 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2911 {
2912     return vreinterpretq_m128i_s8(
2913         vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2914 }
2915 
2916 // Add packed unsigned 16-bit integers in a and b using saturation, and store
2917 // the results in dst.
2918 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
2919 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2920 {
2921     return vreinterpretq_m128i_u16(
2922         vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2923 }
2924 
2925 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
2926 // b and saturates.
2927 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
2928 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
2929 {
2930     return vreinterpretq_m128i_u8(
2931         vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2932 }
2933 
2934 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
2935 // elements in a and b, and store the results in dst.
2936 //
2937 //   FOR j := 0 to 1
2938 //     i := j*64
2939 //     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
2940 //   ENDFOR
2941 //
2942 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
2943 FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
2944 {
2945     return vreinterpretq_m128d_s64(
2946         vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
2947 }
2948 
2949 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
2950 // b.
2951 //
2952 //   r := a & b
2953 //
2954 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
2955 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
2956 {
2957     return vreinterpretq_m128i_s32(
2958         vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2959 }
2960 
2961 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
2962 // elements in a and then AND with b, and store the results in dst.
2963 //
2964 //   FOR j := 0 to 1
2965 //     i := j*64
2966 //     dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
2967 //   ENDFOR
2968 //
2969 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
2970 FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
2971 {
2972     // *NOTE* argument swap
2973     return vreinterpretq_m128d_s64(
2974         vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
2975 }
2976 
2977 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
2978 // 128-bit value in a.
2979 //
2980 //   r := (~a) & b
2981 //
2982 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
2983 FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
2984 {
2985     return vreinterpretq_m128i_s32(
2986         vbicq_s32(vreinterpretq_s32_m128i(b),
2987                   vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
2988 }
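
// Illustrative usage of _mm_andnot_si128 (a minimal sketch; `x` is a
// hypothetical operand): the first argument is the one that gets inverted,
// i.e. dst = (~a) & b, hence the swapped operands of vbicq above.
//
//   __m128i sign_bits = _mm_set1_epi32((int) 0x80000000);
//   __m128i cleared = _mm_andnot_si128(sign_bits, x);  // x & 0x7FFFFFFF per lane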
2989 
2990 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
2991 // unsigned 16-bit integers in b and rounds.
2992 //
2993 //   r0 := (a0 + b0 + 1) >> 1
2994 //   r1 := (a1 + b1 + 1) >> 1
2995 //   ...
2996 //   r7 := (a7 + b7 + 1) >> 1
2997 //
2998 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
2999 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3000 {
3001     return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3002                                  vreinterpretq_u16_m128i(b));
3003 }
3004 
3005 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
3006 // unsigned 8-bit integers in b and rounds.
3007 //
3008 //   r0 := (a0 + b0 + 1) >> 1
3009 //   r1 := (a1 + b1 + 1) >> 1
3010 //   ...
3011 //   r15 := (a15 + b15 + 1) >> 1
3012 //
3013 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
3014 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3015 {
3016     return vreinterpretq_m128i_u8(
3017         vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3018 }
3019 
3020 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
3021 // dst.
3022 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
3023 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3024 
3025 // Shift a right by imm8 bytes while shifting in zeros, and store the results in
3026 // dst.
3027 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
3028 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
3029 
3030 // Cast vector of type __m128d to type __m128. This intrinsic is only used for
3031 // compilation and does not generate any instructions, thus it has zero latency.
3032 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
3033 FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3034 {
3035     return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3036 }
3037 
3038 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3039 // compilation and does not generate any instructions, thus it has zero latency.
3040 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
3041 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3042 {
3043     return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3044 }
3045 
3046 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3047 // compilation and does not generate any instructions, thus it has zero latency.
3048 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
3049 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3050 {
3051     return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3052 }
3053 
3054 // Applies a type cast to reinterpret four 32-bit floating point values passed
3055 // in as a 128-bit parameter as packed 32-bit integers.
3056 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
3057 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3058 {
3059     return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3060 }
3061 
3062 // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3063 // compilation and does not generate any instructions, thus it has zero latency.
3064 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
3065 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3066 {
3067 #if defined(__aarch64__)
3068     return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3069 #else
3070     return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3071 #endif
3072 }
3073 
3074 // Applies a type cast to reinterpret four 32-bit integers passed in as a
3075 // 128-bit parameter as packed 32-bit floating point values.
3076 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
3077 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3078 {
3079     return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3080 }
3081 
3082 // Cache line containing p is flushed and invalidated from all caches in the
3083 // coherency domain.
3084 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
3085 FORCE_INLINE void _mm_clflush(void const *p)
3086 {
3087     (void) p;
3088     // no NEON counterpart for a cache-line flush, so this is a no-op
3089 }
3090 
3091 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3092 // unsigned 16-bit integers in b for equality.
3093 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
3094 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3095 {
3096     return vreinterpretq_m128i_u16(
3097         vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3098 }
3099 
3100 // Compare packed 32-bit integers in a and b for equality, and store the results
3101 // in dst.
3102 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3103 {
3104     return vreinterpretq_m128i_u32(
3105         vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3106 }
3107 
3108 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3109 // unsigned 8-bit integers in b for equality.
3110 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
3111 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3112 {
3113     return vreinterpretq_m128i_u8(
3114         vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3115 }
3116 
3117 // Compare packed double-precision (64-bit) floating-point elements in a and b
3118 // for equality, and store the results in dst.
3119 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
3120 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3121 {
3122 #if defined(__aarch64__)
3123     return vreinterpretq_m128d_u64(
3124         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3125 #else
3126     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3127     uint32x4_t cmp =
3128         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3129     uint32x4_t swapped = vrev64q_u32(cmp);
3130     return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3131 #endif
3132 }
3133 
3134 // Compare the lower double-precision (64-bit) floating-point elements in a and
3135 // b for equality, store the result in the lower element of dst, and copy the
3136 // upper element from a to the upper element of dst.
3137 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
3138 FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3139 {
3140     return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3141 }
3142 
3143 // Compare packed double-precision (64-bit) floating-point elements in a and b
3144 // for greater-than-or-equal, and store the results in dst.
3145 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
3146 FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3147 {
3148 #if defined(__aarch64__)
3149     return vreinterpretq_m128d_u64(
3150         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3151 #else
3152     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3153     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3154     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3155     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3156     uint64_t d[2];
3157     d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3158     d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3159 
3160     return vreinterpretq_m128d_u64(vld1q_u64(d));
3161 #endif
3162 }
3163 
3164 // Compare the lower double-precision (64-bit) floating-point elements in a and
3165 // b for greater-than-or-equal, store the result in the lower element of dst,
3166 // and copy the upper element from a to the upper element of dst.
3167 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
3168 FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3169 {
3170 #if defined(__aarch64__)
3171     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3172 #else
3173     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3174     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3175     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3176     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3177     uint64_t d[2];
3178     d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3179     d[1] = a1;
3180 
3181     return vreinterpretq_m128d_u64(vld1q_u64(d));
3182 #endif
3183 }
3184 
3185 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3186 // in b for greater than.
3187 //
3188 //   r0 := (a0 > b0) ? 0xffff : 0x0
3189 //   r1 := (a1 > b1) ? 0xffff : 0x0
3190 //   ...
3191 //   r7 := (a7 > b7) ? 0xffff : 0x0
3192 //
3193 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
3194 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3195 {
3196     return vreinterpretq_m128i_u16(
3197         vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3198 }
3199 
3200 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3201 // in b for greater than.
3202 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
3203 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3204 {
3205     return vreinterpretq_m128i_u32(
3206         vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3207 }
3208 
3209 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3210 // in b for greater than.
3211 //
3212 //   r0 := (a0 > b0) ? 0xff : 0x0
3213 //   r1 := (a1 > b1) ? 0xff : 0x0
3214 //   ...
3215 //   r15 := (a15 > b15) ? 0xff : 0x0
3216 //
3217 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
3218 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3219 {
3220     return vreinterpretq_m128i_u8(
3221         vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3222 }
3223 
3224 // Compare packed double-precision (64-bit) floating-point elements in a and b
3225 // for greater-than, and store the results in dst.
3226 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
3227 FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3228 {
3229 #if defined(__aarch64__)
3230     return vreinterpretq_m128d_u64(
3231         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3232 #else
3233     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3234     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3235     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3236     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3237     uint64_t d[2];
3238     d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3239     d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3240 
3241     return vreinterpretq_m128d_u64(vld1q_u64(d));
3242 #endif
3243 }
3244 
3245 // Compare the lower double-precision (64-bit) floating-point elements in a and
3246 // b for greater-than, store the result in the lower element of dst, and copy
3247 // the upper element from a to the upper element of dst.
3248 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
3249 FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3250 {
3251 #if defined(__aarch64__)
3252     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3253 #else
3254     // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
3255     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3256     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3257     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3258     uint64_t d[2];
3259     d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3260     d[1] = a1;
3261 
3262     return vreinterpretq_m128d_u64(vld1q_u64(d));
3263 #endif
3264 }
3265 
3266 // Compare packed double-precision (64-bit) floating-point elements in a and b
3267 // for less-than-or-equal, and store the results in dst.
3268 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
3269 FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3270 {
3271 #if defined(__aarch64__)
3272     return vreinterpretq_m128d_u64(
3273         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3274 #else
3275     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3276     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3277     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3278     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3279     uint64_t d[2];
3280     d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3281     d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3282 
3283     return vreinterpretq_m128d_u64(vld1q_u64(d));
3284 #endif
3285 }
3286 
3287 // Compare the lower double-precision (64-bit) floating-point elements in a and
3288 // b for less-than-or-equal, store the result in the lower element of dst, and
3289 // copy the upper element from a to the upper element of dst.
3290 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
3291 FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3292 {
3293 #if defined(__aarch64__)
3294     return _mm_move_sd(a, _mm_cmple_pd(a, b));
3295 #else
3296     // expand "_mm_cmple_pd()" to reduce unnecessary operations
3297     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3298     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3299     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3300     uint64_t d[2];
3301     d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3302     d[1] = a1;
3303 
3304     return vreinterpretq_m128d_u64(vld1q_u64(d));
3305 #endif
3306 }
3307 
3308 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3309 // in b for less than.
3310 //
3311 //   r0 := (a0 < b0) ? 0xffff : 0x0
3312 //   r1 := (a1 < b1) ? 0xffff : 0x0
3313 //   ...
3314 //   r7 := (a7 < b7) ? 0xffff : 0x0
3315 //
3316 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
3317 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3318 {
3319     return vreinterpretq_m128i_u16(
3320         vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3321 }
3322 
3323 
3324 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3325 // in b for less than.
3326 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
3327 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3328 {
3329     return vreinterpretq_m128i_u32(
3330         vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3331 }
3332 
3333 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3334 // in b for less than.
3335 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
3336 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3337 {
3338     return vreinterpretq_m128i_u8(
3339         vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3340 }
3341 
3342 // Compare packed double-precision (64-bit) floating-point elements in a and b
3343 // for less-than, and store the results in dst.
3344 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
3345 FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3346 {
3347 #if defined(__aarch64__)
3348     return vreinterpretq_m128d_u64(
3349         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3350 #else
3351     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3352     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3353     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3354     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3355     uint64_t d[2];
3356     d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3357     d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3358 
3359     return vreinterpretq_m128d_u64(vld1q_u64(d));
3360 #endif
3361 }
3362 
3363 // Compare the lower double-precision (64-bit) floating-point elements in a and
3364 // b for less-than, store the result in the lower element of dst, and copy the
3365 // upper element from a to the upper element of dst.
3366 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
3367 FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3368 {
3369 #if defined(__aarch64__)
3370     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3371 #else
3372     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3373     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3374     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3375     uint64_t d[2];
3376     d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3377     d[1] = a1;
3378 
3379     return vreinterpretq_m128d_u64(vld1q_u64(d));
3380 #endif
3381 }
3382 
3383 // Compare packed double-precision (64-bit) floating-point elements in a and b
3384 // for not-equal, and store the results in dst.
3385 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
3386 FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3387 {
3388 #if defined(__aarch64__)
3389     return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3390         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3391 #else
3392     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3393     uint32x4_t cmp =
3394         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3395     uint32x4_t swapped = vrev64q_u32(cmp);
3396     return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3397 #endif
3398 }
3399 
3400 // Compare the lower double-precision (64-bit) floating-point elements in a and
3401 // b for not-equal, store the result in the lower element of dst, and copy the
3402 // upper element from a to the upper element of dst.
3403 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
3404 FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3405 {
3406     return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3407 }
3408 
3409 // Compare packed double-precision (64-bit) floating-point elements in a and b
3410 // for not-greater-than-or-equal, and store the results in dst.
3411 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
3412 #define _mm_cmpnge_pd(a, b) _mm_cmplt_pd(a, b)
3413 
3414 // Compare the lower double-precision (64-bit) floating-point elements in a and
3415 // b for not-greater-than-or-equal, store the result in the lower element of
3416 // dst, and copy the upper element from a to the upper element of dst.
3417 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
3418 #define _mm_cmpnge_sd(a, b) _mm_cmplt_sd(a, b)
3419 
3420 // Compare packed double-precision (64-bit) floating-point elements in a and b
3421 // for not-greater-than, and store the results in dst.
3422 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_pd
3423 #define _mm_cmpngt_pd(a, b) _mm_cmple_pd(a, b)
3424 
3425 // Compare the lower double-precision (64-bit) floating-point elements in a and
3426 // b for not-greater-than, store the result in the lower element of dst, and
3427 // copy the upper element from a to the upper element of dst.
3428 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
3429 #define _mm_cmpngt_sd(a, b) _mm_cmple_sd(a, b)
3430 
3431 // Compare packed double-precision (64-bit) floating-point elements in a and b
3432 // for not-less-than-or-equal, and store the results in dst.
3433 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
3434 #define _mm_cmpnle_pd(a, b) _mm_cmpgt_pd(a, b)
3435 
3436 // Compare the lower double-precision (64-bit) floating-point elements in a and
3437 // b for not-less-than-or-equal, store the result in the lower element of dst,
3438 // and copy the upper element from a to the upper element of dst.
3439 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
3440 #define _mm_cmpnle_sd(a, b) _mm_cmpgt_sd(a, b)
3441 
3442 // Compare packed double-precision (64-bit) floating-point elements in a and b
3443 // for not-less-than, and store the results in dst.
3444 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
3445 #define _mm_cmpnlt_pd(a, b) _mm_cmpge_pd(a, b)
3446 
3447 // Compare the lower double-precision (64-bit) floating-point elements in a and
3448 // b for not-less-than, store the result in the lower element of dst, and copy
3449 // the upper element from a to the upper element of dst.
3450 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
3451 #define _mm_cmpnlt_sd(a, b) _mm_cmpge_sd(a, b)
3452 
3453 // Compare packed double-precision (64-bit) floating-point elements in a and b
3454 // to see if neither is NaN, and store the results in dst.
3455 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
3456 FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3457 {
3458 #if defined(__aarch64__)
3459     // Excluding NaNs, any two floating point numbers can be compared.
3460     uint64x2_t not_nan_a =
3461         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3462     uint64x2_t not_nan_b =
3463         vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3464     return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3465 #else
3466     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3467     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3468     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3469     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3470     uint64_t d[2];
3471     d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3472             (*(double *) &b0) == (*(double *) &b0))
3473                ? ~UINT64_C(0)
3474                : UINT64_C(0);
3475     d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3476             (*(double *) &b1) == (*(double *) &b1))
3477                ? ~UINT64_C(0)
3478                : UINT64_C(0);
3479 
3480     return vreinterpretq_m128d_u64(vld1q_u64(d));
3481 #endif
3482 }
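
// Note on the ordered/unordered comparisons: IEEE 754 defines NaN != NaN, so
// the self-comparison vceqq_f64(x, x) is all-ones exactly for the lanes of x
// that are not NaN. ANDing the self-comparisons of a and b gives the "ordered"
// mask used here; _mm_cmpunord_pd below returns its bitwise NOT.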
3483 
3484 // Compare the lower double-precision (64-bit) floating-point elements in a and
3485 // b to see if neither is NaN, store the result in the lower element of dst, and
3486 // copy the upper element from a to the upper element of dst.
3487 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
3488 FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3489 {
3490 #if defined(__aarch64__)
3491     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3492 #else
3493     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3494     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3495     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3496     uint64_t d[2];
3497     d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3498             (*(double *) &b0) == (*(double *) &b0))
3499                ? ~UINT64_C(0)
3500                : UINT64_C(0);
3501     d[1] = a1;
3502 
3503     return vreinterpretq_m128d_u64(vld1q_u64(d));
3504 #endif
3505 }
3506 
3507 // Compare packed double-precision (64-bit) floating-point elements in a and b
3508 // to see if either is NaN, and store the results in dst.
3509 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
3510 FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3511 {
3512 #if defined(__aarch64__)
3513     // NaN is never equal to anything, including itself.
3514     uint64x2_t not_nan_a =
3515         vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3516     uint64x2_t not_nan_b =
3517         vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3518     return vreinterpretq_m128d_s32(
3519         vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3520 #else
3521     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3522     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3523     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3524     uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3525     uint64_t d[2];
3526     d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3527             (*(double *) &b0) == (*(double *) &b0))
3528                ? UINT64_C(0)
3529                : ~UINT64_C(0);
3530     d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3531             (*(double *) &b1) == (*(double *) &b1))
3532                ? UINT64_C(0)
3533                : ~UINT64_C(0);
3534 
3535     return vreinterpretq_m128d_u64(vld1q_u64(d));
3536 #endif
3537 }
3538 
3539 // Compare the lower double-precision (64-bit) floating-point elements in a and
3540 // b to see if either is NaN, store the result in the lower element of dst, and
3541 // copy the upper element from a to the upper element of dst.
3542 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
3543 FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3544 {
3545 #if defined(__aarch64__)
3546     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3547 #else
3548     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3549     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3550     uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3551     uint64_t d[2];
3552     d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3553             (*(double *) &b0) == (*(double *) &b0))
3554                ? UINT64_C(0)
3555                : ~UINT64_C(0);
3556     d[1] = a1;
3557 
3558     return vreinterpretq_m128d_u64(vld1q_u64(d));
3559 #endif
3560 }
3561 
3562 // Compare the lower double-precision (64-bit) floating-point element in a and b
3563 // for greater-than-or-equal, and return the boolean result (0 or 1).
3564 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
3565 FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3566 {
3567 #if defined(__aarch64__)
3568     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3569 #else
3570     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3571     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3572 
3573     return (*(double *) &a0 >= *(double *) &b0);
3574 #endif
3575 }
3576 
3577 // Compare the lower double-precision (64-bit) floating-point element in a and b
3578 // for greater-than, and return the boolean result (0 or 1).
3579 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
3580 FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3581 {
3582 #if defined(__aarch64__)
3583     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3584 #else
3585     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3586     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3587 
3588     return (*(double *) &a0 > *(double *) &b0);
3589 #endif
3590 }
3591 
3592 // Compare the lower double-precision (64-bit) floating-point element in a and b
3593 // for less-than-or-equal, and return the boolean result (0 or 1).
3594 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
3595 FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3596 {
3597 #if defined(__aarch64__)
3598     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3599 #else
3600     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3601     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3602 
3603     return (*(double *) &a0 <= *(double *) &b0);
3604 #endif
3605 }
3606 
3607 // Compare the lower double-precision (64-bit) floating-point element in a and b
3608 // for less-than, and return the boolean result (0 or 1).
3609 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
3610 FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3611 {
3612 #if defined(__aarch64__)
3613     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3614 #else
3615     uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3616     uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3617 
3618     return (*(double *) &a0 < *(double *) &b0);
3619 #endif
3620 }
3621 
3622 // Compare the lower double-precision (64-bit) floating-point element in a and b
3623 // for equality, and return the boolean result (0 or 1).
3624 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
3625 FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3626 {
3627 #if defined(__aarch64__)
3628     return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3629 #else
3630     uint32x4_t a_not_nan =
3631         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3632     uint32x4_t b_not_nan =
3633         vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3634     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3635     uint32x4_t a_eq_b =
3636         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3637     uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3638                                        vreinterpretq_u64_u32(a_eq_b));
3639     return !!vgetq_lane_u64(and_results, 0);
3640 #endif
3641 }
3642 
3643 // Compare the lower double-precision (64-bit) floating-point element in a and b
3644 // for not-equal, and return the boolean result (0 or 1).
3645 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
3646 FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3647 {
3648 #if defined(__aarch64__)
3649     return !vgetq_lane_u64(vceqq_f64(a, b), 0);
3650 #else
3651     // FIXME we should handle NaN condition here
3652     uint32x4_t a_eq_b =
3653         vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3654     return !vgetq_lane_u64(vreinterpretq_u64_u32(a_eq_b), 0);
3655 #endif
3656 }
3657 
3658 // Convert packed signed 32-bit integers in a to packed double-precision
3659 // (64-bit) floating-point elements, and store the results in dst.
3660 //
3661 //   FOR j := 0 to 1
3662 //     i := j*32
3663 //     m := j*64
3664 //     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3665 //   ENDFOR
3666 //
3667 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
3668 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3669 {
3670 #if defined(__aarch64__)
3671     return vreinterpretq_m128d_f64(
3672         vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3673 #else
3674     double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3675     double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3676     return _mm_set_pd(a1, a0);
3677 #endif
3678 }
3679 
3680 // Converts the four signed 32-bit integer values of a to single-precision,
3681 // floating-point values
3682 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
3683 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3684 {
3685     return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3686 }
3687 
3688 // Convert packed double-precision (64-bit) floating-point elements in a to
3689 // packed 32-bit integers, and store the results in dst.
3690 //
3691 //   FOR j := 0 to 1
3692 //      i := 32*j
3693 //      k := 64*j
3694 //      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3695 //   ENDFOR
3696 //
3697 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
3698 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3699 {
3700     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3701     double d0 = ((double *) &rnd)[0];
3702     double d1 = ((double *) &rnd)[1];
3703     return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3704 }
3705 
3706 // Convert packed double-precision (64-bit) floating-point elements in a to
3707 // packed 32-bit integers, and store the results in dst.
3708 //
3709 //   FOR j := 0 to 1
3710 //      i := 32*j
3711 //      k := 64*j
3712 //      dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3713 //   ENDFOR
3714 //
3715 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
3716 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3717 {
3718     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3719     double d0 = ((double *) &rnd)[0];
3720     double d1 = ((double *) &rnd)[1];
3721     int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3722     return vreinterpret_m64_s32(vld1_s32(data));
3723 }
3724 
3725 // Convert packed double-precision (64-bit) floating-point elements in a to
3726 // packed single-precision (32-bit) floating-point elements, and store the
3727 // results in dst.
3728 //
3729 //   FOR j := 0 to 1
3730 //     i := 32*j
3731 //     k := 64*j
3732 //     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
3733 //   ENDFOR
3734 //   dst[127:64] := 0
3735 //
3736 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
_mm_cvtpd_ps(__m128d a)3737 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3738 {
3739 #if defined(__aarch64__)
3740     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3741     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3742 #else
3743     float a0 = (float) ((double *) &a)[0];
3744     float a1 = (float) ((double *) &a)[1];
3745     return _mm_set_ps(0, 0, a1, a0);
3746 #endif
3747 }
3748 
3749 // Convert packed signed 32-bit integers in a to packed double-precision
3750 // (64-bit) floating-point elements, and store the results in dst.
3751 //
3752 //   FOR j := 0 to 1
3753 //     i := j*32
3754 //     m := j*64
3755 //     dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3756 //   ENDFOR
3757 //
3758 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
_mm_cvtpi32_pd(__m64 a)3759 FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3760 {
3761 #if defined(__aarch64__)
3762     return vreinterpretq_m128d_f64(
3763         vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3764 #else
3765     double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3766     double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3767     return _mm_set_pd(a1, a0);
3768 #endif
3769 }
3770 
3771 // Converts the four single-precision, floating-point values of a to signed
3772 // 32-bit integer values.
3773 //
3774 //   r0 := (int) a0
3775 //   r1 := (int) a1
3776 //   r2 := (int) a2
3777 //   r3 := (int) a3
3778 //
3779 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
3780 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3781 // does not support! It is supported on ARMv8-A however.
_mm_cvtps_epi32(__m128 a)3782 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3783 {
3784 #if defined(__aarch64__)
3785     switch (_MM_GET_ROUNDING_MODE()) {
3786     case _MM_ROUND_NEAREST:
3787         return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3788     case _MM_ROUND_DOWN:
3789         return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3790     case _MM_ROUND_UP:
3791         return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
3792     default:  // _MM_ROUND_TOWARD_ZERO
3793         return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3794     }
3795 #else
3796     float *f = (float *) &a;
3797     switch (_MM_GET_ROUNDING_MODE()) {
3798     case _MM_ROUND_NEAREST: {
3799         uint32x4_t signmask = vdupq_n_u32(0x80000000);
3800         float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3801                                      vdupq_n_f32(0.5f)); /* +/- 0.5 */
3802         int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3803             vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
3804         int32x4_t r_trunc = vcvtq_s32_f32(
3805             vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
3806         int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3807             vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
3808         int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3809                                      vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
3810         float32x4_t delta = vsubq_f32(
3811             vreinterpretq_f32_m128(a),
3812             vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
3813         uint32x4_t is_delta_half =
3814             vceqq_f32(delta, half); /* delta == +/- 0.5 */
3815         return vreinterpretq_m128i_s32(
3816             vbslq_s32(is_delta_half, r_even, r_normal));
3817     }
3818     case _MM_ROUND_DOWN:
3819         return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
3820                              floorf(f[0]));
3821     case _MM_ROUND_UP:
3822         return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
3823                              ceilf(f[0]));
3824     default:  // _MM_ROUND_TOWARD_ZERO
3825         return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
3826                              (int32_t) f[0]);
3827     }
3828 #endif
3829 }
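
// Illustrative sketch (not part of the translation layer): under the default
// _MM_ROUND_NEAREST mode, ties round to the nearest even integer, which the
// ARMv7-A emulation above reproduces. For example:
//
//   __m128 v = _mm_set_ps(2.5f, 1.5f, -0.5f, 2.4f);
//   __m128i r = _mm_cvtps_epi32(v);  // lanes 0..3: {2, 0, 2, 2}
//
// i.e. both 1.5f and 2.5f convert to 2 because each tie goes to the even
// neighbour, matching x86 SSE2 behaviour.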

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed double-precision (64-bit) floating-point elements, and store the
// results in dst.
//
//   FOR j := 0 to 1
//     i := 64*j
//     k := 32*j
//     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
#else
    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}

// Copy the lower double-precision (64-bit) floating-point element of a to dst.
//
//   dst[63:0] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
{
#if defined(__aarch64__)
    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
#else
    return ((double *) &a)[0];
#endif
}

// Convert the lower double-precision (64-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
//
//   dst[31:0] := Convert_FP64_To_Int32(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
{
#if defined(__aarch64__)
    return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
#else
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double ret = ((double *) &rnd)[0];
    return (int32_t) ret;
#endif
}

// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
//
//   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
{
#if defined(__aarch64__)
    return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
#else
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double ret = ((double *) &rnd)[0];
    return (int64_t) ret;
#endif
}

// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
//
//   dst[63:0] := Convert_FP64_To_Int64(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
#define _mm_cvtsd_si64x _mm_cvtsd_si64

// Convert the lower double-precision (64-bit) floating-point element in b to a
// single-precision (32-bit) floating-point element, store the result in the
// lower element of dst, and copy the upper 3 packed elements from a to the
// upper elements of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128_f32(vsetq_lane_f32(
        vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
        vreinterpretq_f32_m128(a), 0));
#else
    return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
                                                 vreinterpretq_f32_m128(a), 0));
#endif
}

// Copy the lower 32-bit integer in a to dst.
//
//   dst[31:0] := a[31:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
{
    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
}

// Copy the lower 64-bit integer in a to dst.
//
//   dst[63:0] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
{
    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
}

// Copy the lower 64-bit integer in a to dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)

// Convert the signed 32-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
#else
    double bf = (double) b;
    return vreinterpretq_m128d_s64(
        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
#endif
}

// Copy the lower 64-bit integer in a to dst.
//
//   dst[63:0] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)

// Moves 32-bit integer a to the least significant 32 bits of an __m128i object,
// zero extending the upper bits.
//
//   r0 := a
//   r1 := 0x0
//   r2 := 0x0
//   r3 := 0x0
//
// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
{
    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
}

// Convert the signed 64-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
#else
    double bf = (double) b;
    return vreinterpretq_m128d_s64(
        vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
#endif
}

// Moves 64-bit integer a to the least significant 64 bits of an __m128i object,
// zero extending the upper bits.
//
//   r0 := a
//   r1 := 0x0
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
{
    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
}

// Copy 64-bit integer a to the lower element of dst, and zero the upper
// element.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)

// Convert the signed 64-bit integer b to a double-precision (64-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)

// Convert the lower single-precision (32-bit) floating-point element in b to a
// double-precision (64-bit) floating-point element, store the result in the
// lower element of dst, and copy the upper element from a to the upper element
// of dst.
//
//   dst[63:0] := Convert_FP32_To_FP64(b[31:0])
//   dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
{
    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
#else
    return vreinterpretq_m128d_s64(
        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
#endif
}

// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
{
    double a0 = ((double *) &a)[0];
    double a1 = ((double *) &a)[1];
    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
}

// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
{
    double a0 = ((double *) &a)[0];
    double a1 = ((double *) &a)[1];
    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
    return vreinterpret_m64_s32(vld1_s32(data));
}

// Converts the four single-precision, floating-point values of a to signed
// 32-bit integer values using truncate.
// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
{
    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
}

// Convert the lower double-precision (64-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
//
//   dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
{
    double ret = *((double *) &a);
    return (int32_t) ret;
}

// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
//
//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
{
#if defined(__aarch64__)
    return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
#else
    double ret = *((double *) &a);
    return (int64_t) ret;
#endif
}

// Convert the lower double-precision (64-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
//
//   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)

// Divide packed double-precision (64-bit) floating-point elements in a by
// packed elements in b, and store the results in dst.
//
//  FOR j := 0 to 1
//    i := 64*j
//    dst[i+63:i] := a[i+63:i] / b[i+63:i]
//  ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] / db[0];
    c[1] = da[1] / db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Divide the lower double-precision (64-bit) floating-point element in a by the
// lower double-precision (64-bit) floating-point element in b, store the result
// in the lower element of dst, and copy the upper element from a to the upper
// element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    float64x2_t tmp =
        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_f64(
        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
#else
    return _mm_move_sd(a, _mm_div_pd(a, b));
#endif
}

// Extracts the selected signed or unsigned 16-bit integer from a and zero
// extends.
// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
#define _mm_extract_epi16(a, imm) \
    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))

// Inserts the least significant 16 bits of b into the selected 16-bit integer
// of a.
// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
//                                       __constrange(0,8) int imm)
#define _mm_insert_epi16(a, b, imm)                                  \
    __extension__({                                                  \
        vreinterpretq_m128i_s16(                                     \
            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
    })
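
// Illustrative usage sketch: imm must be an integer constant expression in
// [0, 7] because it selects a NEON lane at compile time. For example:
//
//   __m128i v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
//   int x = _mm_extract_epi16(v, 3);   // x == 3 (zero-extended)
//   v = _mm_insert_epi16(v, 42, 3);    // lane 3 now holds 42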

// Loads two double-precision floating-point values from 16-byte aligned
// memory.
//
//   dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
FORCE_INLINE __m128d _mm_load_pd(const double *p)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vld1q_f64(p));
#else
    const float *fp = (const float *) p;
    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
    return vreinterpretq_m128d_f32(vld1q_f32(data));
#endif
}

// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
#define _mm_load_pd1 _mm_load1_pd

// Load a double-precision (64-bit) floating-point element from memory into the
// lower of dst, and zero the upper element. mem_addr does not need to be
// aligned on any particular boundary.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
FORCE_INLINE __m128d _mm_load_sd(const double *p)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
#else
    const float *fp = (const float *) p;
    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
    return vreinterpretq_m128d_f32(vld1q_f32(data));
#endif
}

// Loads a 128-bit value.
// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

// Load a double-precision (64-bit) floating-point element from memory into both
// elements of dst.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
#else
    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
#endif
}

// Load a double-precision (64-bit) floating-point element from memory into the
// upper element of dst, and copy the lower element from a to dst. mem_addr does
// not need to be aligned on any particular boundary.
//
//   dst[63:0] := a[63:0]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
#else
    return vreinterpretq_m128d_f32(vcombine_f32(
        vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
#endif
}

// Load 64-bit integer from memory into the first element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
{
    /* Load the lower 64 bits of the value pointed to by p into the
     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
     */
    return vreinterpretq_m128i_s32(
        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
}

// Load a double-precision (64-bit) floating-point element from memory into the
// lower element of dst, and copy the upper element from a to dst. mem_addr does
// not need to be aligned on any particular boundary.
//
//   dst[63:0] := MEM[mem_addr+63:mem_addr]
//   dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
#else
    return vreinterpretq_m128d_f32(
        vcombine_f32(vld1_f32((const float *) p),
                     vget_high_f32(vreinterpretq_f32_m128d(a))));
#endif
}

// Load 2 double-precision (64-bit) floating-point elements from memory into dst
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
//   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
//   dst[127:64] := MEM[mem_addr+63:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
{
#if defined(__aarch64__)
    float64x2_t v = vld1q_f64(p);
    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
#else
    int64x2_t v = vld1q_s64((const int64_t *) p);
    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
#endif
}

// Loads two double-precision floating-point values from unaligned memory.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
{
    return _mm_load_pd(p);
}

// Loads a 128-bit value.
// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
{
    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
}

// Load unaligned 32-bit integer from memory into the first element of dst.
//
//   dst[31:0] := MEM[mem_addr+31:mem_addr]
//   dst[MAX:32] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
{
    return vreinterpretq_m128i_s32(
        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
}

// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
// integers from b.
//
//   r0 := (a0 * b0) + (a1 * b1)
//   r1 := (a2 * b2) + (a3 * b3)
//   r2 := (a4 * b4) + (a5 * b5)
//   r3 := (a6 * b6) + (a7 * b7)
// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                              vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                               vget_high_s16(vreinterpretq_s16_m128i(b)));

    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));

    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
}
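
// Illustrative usage sketch: _mm_madd_epi16 multiplies corresponding 16-bit
// lanes and adds adjacent pairs, which makes it a building block for 16-bit
// dot products. For example:
//
//   __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
//   __m128i b = _mm_set1_epi16(1);
//   __m128i r = _mm_madd_epi16(a, b);  // 32-bit lanes 0..3: {3, 7, 11, 15}
//
// Summing the four 32-bit lanes then yields the full dot product (36 here).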

// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint. mem_addr does not need to be aligned
// on any particular boundary.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
{
    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
    __m128 b = _mm_load_ps((const float *) mem_addr);
    int8x16_t masked =
        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
                 vreinterpretq_s8_m128(b));
    vst1q_s8((int8_t *) mem_addr, masked);
}
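
// Note on this emulation: unlike the x86 instruction, the sequence above is a
// full 16-byte read-modify-write of mem_addr (load, blend on the mask's sign
// bits, store) rather than a non-temporal conditional store. Illustrative
// usage sketch:
//
//   char buf[16] = {0};
//   __m128i data = _mm_set1_epi8(0x55);
//   __m128i mask = _mm_set_epi8(-1, 0, -1, 0, -1, 0, -1, 0,
//                               -1, 0, -1, 0, -1, 0, -1, 0);
//   _mm_maskmoveu_si128(data, mask, buf);  // writes 0x55 to every other byte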

// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
// signed 16-bit integers from b.
// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
// 16 unsigned 8-bit integers from b.
// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b,
// and store packed maximum values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b, store the maximum value in the lower element of dst, and copy the upper
// element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_max_pd(a, b));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2] = {fmax(da[0], db[0]), da[1]};
    return vld1q_f32((float32_t *) c);
#endif
}

// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
// signed 16-bit integers from b.
// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
// 16 unsigned 8-bit integers from b.
// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b,
// and store packed minimum values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b, store the minimum value in the lower element of dst, and copy the upper
// element from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return _mm_move_sd(a, _mm_min_pd(a, b));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2] = {fmin(da[0], db[0]), da[1]};
    return vld1q_f32((float32_t *) c);
#endif
}

// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
// upper element.
//
//   dst[63:0] := a[63:0]
//   dst[127:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
{
    return vreinterpretq_m128i_s64(
        vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
}

// Move the lower double-precision (64-bit) floating-point element from b to the
// lower element of dst, and copy the upper element from a to the upper element
// of dst.
//
//   dst[63:0] := b[63:0]
//   dst[127:64] := a[127:64]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
{
    return vreinterpretq_m128d_f32(
        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
                     vget_high_f32(vreinterpretq_f32_m128d(a))));
}

// NEON does not provide a version of this function.
// Creates a 16-bit mask from the most significant bits of the 16 signed or
// unsigned 8-bit integers in a and zero extends the upper bits.
// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
{
    // Use increasingly wide shifts+adds to collect the sign bits
    // together.
    // Since the widening shifts would be rather confusing to follow in little
    // endian, everything will be illustrated in big endian order instead. This
    // has a different result - the bits would actually be reversed on a big
    // endian machine.

    // Starting input (only half the elements are shown):
    // 89 ff 1d c0 00 10 99 33
    uint8x16_t input = vreinterpretq_u8_m128i(a);

    // Shift out everything but the sign bits with an unsigned shift right.
    //
    // Bytes of the vector:
    // 89 ff 1d c0 00 10 99 33
    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
    //  |  |  |  |  |  |  |  |
    // 01 01 00 01 00 00 01 00
    //
    // Bits of first important lane(s):
    // 10001001 (89)
    // \______
    //        |
    // 00000001 (01)
    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));

    // Merge the even lanes together with a 16-bit unsigned shift right + add.
    // 'xx' represents garbage data which will be ignored in the final result.
    // In the important bytes, the add functions like a binary OR.
    //
    // 01 01 00 01 00 00 01 00
    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
    //    \|    \|    \|    \|
    // xx 03 xx 01 xx 00 xx 02
    //
    // 00000001 00000001 (01 01)
    //        \_______ |
    //                \|
    // xxxxxxxx xxxxxx11 (xx 03)
    uint32x4_t paired16 =
        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));

    // Repeat with a wider 32-bit shift + add.
    // xx 03 xx 01 xx 00 xx 02
    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
    //     14))
    //          \|          \|
    // xx xx xx 0d xx xx xx 02
    //
    // 00000011 00000001 (03 01)
    //        \\_____ ||
    //         '----.\||
    // xxxxxxxx xxxx1101 (xx 0d)
    uint64x2_t paired32 =
        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));

    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
    // lanes. xx xx xx 0d xx xx xx 02
    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
    //            28))
    //                      \|
    // xx xx xx xx xx xx xx d2
    //
    // 00001101 00000010 (0d 02)
    //     \   \___ |  |
    //      '---.  \|  |
    // xxxxxxxx 11010010 (xx d2)
    uint8x16_t paired64 =
        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));

    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
    // xx xx xx xx xx xx xx d2
    //                      ||  return paired64[0]
    //                      d2
    // Note: Little endian would return the correct value 4b (01001011) instead.
    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
}
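
// Illustrative usage sketch: _mm_movemask_epi8 is commonly paired with a byte
// compare to locate a matching byte in a 16-byte chunk (p and offset are
// hypothetical; __builtin_ffs is the GCC/Clang find-first-set builtin):
//
//   __m128i chunk = _mm_loadu_si128((const __m128i *) p);
//   int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, _mm_set1_epi8('\n')));
//   if (mask)
//       offset = __builtin_ffs(mask) - 1;  // index of the first '\n'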

// Set each bit of mask dst based on the most significant bit of the
// corresponding packed double-precision (64-bit) floating-point element in a.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
FORCE_INLINE int _mm_movemask_pd(__m128d a)
{
    uint64x2_t input = vreinterpretq_u64_m128d(a);
    uint64x2_t high_bits = vshrq_n_u64(input, 63);
    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
}

// Copy the lower 64-bit integer in a to dst.
//
//   dst[63:0] := a[63:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
{
    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
}

// Copy the 64-bit integer a to the lower element of dst, and zero the upper
// element.
//
//   dst[63:0] := a[63:0]
//   dst[127:64] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
{
    return vreinterpretq_m128i_s64(
        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
}

// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
// a and b, and store the unsigned 64-bit results in dst.
//
//   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
//   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    // vmull_u32 upcasts instead of masking, so we downcast.
    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
}
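
// Illustrative usage sketch: only the even (low) 32-bit lane of each 64-bit
// element participates, so this is a convenient way to get full 32x32 -> 64-bit
// products without overflow:
//
//   __m128i a = _mm_set_epi32(0, 100000, 0, 3);
//   __m128i b = _mm_set_epi32(0, 100000, 0, 5);
//   __m128i r = _mm_mul_epu32(a, b);  // 64-bit lanes: {15, 10000000000}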

// Multiply packed double-precision (64-bit) floating-point elements in a and b,
// and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128d_f64(
        vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] * db[0];
    c[1] = da[1] * db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Multiply the lower double-precision (64-bit) floating-point element in a and
// b, store the result in the lower element of dst, and copy the upper element
// from a to the upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_mul_pd(a, b));
}

// Multiply the low unsigned 32-bit integers from a and b, and store the
// unsigned 64-bit result in dst.
//
//   dst[63:0] := a[31:0] * b[31:0]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
{
    return vreinterpret_m64_u64(vget_low_u64(
        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
}

// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
// integers from b.
//
//   r0 := (a0 * b0)[31:16]
//   r1 := (a1 * b1)[31:16]
//   ...
//   r7 := (a7 * b7)[31:16]
//
// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    /* FIXME: issue with large values because of result saturation */
    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
    uint16x8x2_t r =
        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
    return vreinterpretq_m128i_u16(r.val[1]);
}
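
// Why the vqdmulhq_s16 shortcut above stays commented out: vqdmulhq computes a
// saturating doubled product, so for a = b = INT16_MIN the intermediate
// 2 * (-32768) * (-32768) saturates to 0x7FFFFFFF and the final ">> 1" would
// yield 0x3FFF, whereas _mm_mulhi_epi16 must return 0x4000. The widening
// vmull_s16/vuzpq_u16 path above avoids that corner case.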

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
{
    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
#if defined(__aarch64__)
    uint32x4_t ab7654 =
        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
                              vreinterpretq_u16_u32(ab7654));
    return vreinterpretq_m128i_u16(r);
#else
    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
    uint16x8x2_t r =
        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
    return vreinterpretq_m128i_u16(r.val[1]);
#endif
}

// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
// unsigned 16-bit integers from b.
//
//   r0 := (a0 * b0)[15:0]
//   r1 := (a1 * b1)[15:0]
//   ...
//   r7 := (a7 * b7)[15:0]
//
// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compute the bitwise OR of packed double-precision (64-bit) floating-point
// elements in a and b, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
{
    return vreinterpretq_m128d_s64(
        vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
//
//   r := a | b
//
// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
// saturates.
// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
}

// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
// and saturates.
//
//   r0 := SignedSaturate(a0)
//   r1 := SignedSaturate(a1)
//   r2 := SignedSaturate(a2)
//   r3 := SignedSaturate(a3)
//   r4 := SignedSaturate(b0)
//   r5 := SignedSaturate(b1)
//   r6 := SignedSaturate(b2)
//   r7 := SignedSaturate(b3)
//
// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
}

// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
// integers and saturates.
//
//   r0 := UnsignedSaturate(a0)
//   r1 := UnsignedSaturate(a1)
//   ...
//   r7 := UnsignedSaturate(a7)
//   r8 := UnsignedSaturate(b0)
//   r9 := UnsignedSaturate(b1)
//   ...
//   r15 := UnsignedSaturate(b7)
//
// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
}

// Pause the processor. This is typically used in spin-wait loops and depending
// on the x86 processor typical values are in the 40-100 cycle range. The
// 'yield' instruction isn't a good fit because it's effectively a nop on most
// Arm cores. Experience with several databases has shown that an 'isb' is a
// reasonable approximation.
FORCE_INLINE void _mm_pause()
{
    __asm__ __volatile__("isb\n");
}
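
// Illustrative usage sketch (assuming C11 <stdatomic.h>; 'flag' is a
// hypothetical atomic_flag spinlock variable):
//
//   while (atomic_flag_test_and_set_explicit(&flag, memory_order_acquire))
//       _mm_pause();  // back off while another thread holds the lock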

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce two
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of 64-bit elements in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
}
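
// Illustrative usage sketch: adding the two 16-bit partial sums gives the
// total sum of absolute differences over all 16 bytes, e.g. as a simple
// block-matching cost (block_a and block_b are hypothetical inputs):
//
//   __m128i sad = _mm_sad_epu8(block_a, block_b);
//   int cost = _mm_cvtsi128_si32(sad) +
//              _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));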

// Sets the 8 signed 16-bit integer values.
// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi16(short i7,
                                   short i6,
                                   short i5,
                                   short i4,
                                   short i3,
                                   short i2,
                                   short i1,
                                   short i0)
{
    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
    return vreinterpretq_m128i_s16(vld1q_s16(data));
}

// Sets the 4 signed 32-bit integer values.
// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
    return vreinterpretq_m128i_s32(vld1q_s32(data));
}
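
// Illustrative note: the _mm_set_* helpers take arguments from the highest
// lane down to the lowest, while the _mm_setr_* variants further below take
// them in memory (lane) order, so the following two vectors are identical:
//
//   __m128i x = _mm_set_epi32(3, 2, 1, 0);   // lane 0 holds 0, lane 3 holds 3
//   __m128i y = _mm_setr_epi32(0, 1, 2, 3);  // same contents as x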
4854 
4855 // Returns the __m128i structure with its two 64-bit integer values
4856 // initialized to the values of the two 64-bit integers passed in.
4857 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
_mm_set_epi64(__m64 i1,__m64 i2)4858 FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4859 {
4860     return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
4861 }
4862 
4863 // Returns the __m128i structure with its two 64-bit integer values
4864 // initialized to the values of the two 64-bit integers passed in.
4865 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
_mm_set_epi64x(int64_t i1,int64_t i2)4866 FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4867 {
4868     return vreinterpretq_m128i_s64(
4869         vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4870 }
4871 
4872 // Sets the 16 signed 8-bit integer values.
4873 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
_mm_set_epi8(signed char b15,signed char b14,signed char b13,signed char b12,signed char b11,signed char b10,signed char b9,signed char b8,signed char b7,signed char b6,signed char b5,signed char b4,signed char b3,signed char b2,signed char b1,signed char b0)4874 FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
4875                                   signed char b14,
4876                                   signed char b13,
4877                                   signed char b12,
4878                                   signed char b11,
4879                                   signed char b10,
4880                                   signed char b9,
4881                                   signed char b8,
4882                                   signed char b7,
4883                                   signed char b6,
4884                                   signed char b5,
4885                                   signed char b4,
4886                                   signed char b3,
4887                                   signed char b2,
4888                                   signed char b1,
4889                                   signed char b0)
4890 {
4891     int8_t ALIGN_STRUCT(16)
4892         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
4893                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
4894                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
4895                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4896     return (__m128i) vld1q_s8(data);
4897 }
4898 
4899 // Set packed double-precision (64-bit) floating-point elements in dst with the
4900 // supplied values.
4901 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
_mm_set_pd(double e1,double e0)4902 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
4903 {
4904     double ALIGN_STRUCT(16) data[2] = {e0, e1};
4905 #if defined(__aarch64__)
4906     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
4907 #else
4908     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
4909 #endif
4910 }
4911 
4912 // Broadcast double-precision (64-bit) floating-point value a to all elements of
4913 // dst.
4914 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
4915 #define _mm_set_pd1 _mm_set1_pd
4916 
4917 // Copy double-precision (64-bit) floating-point element a to the lower element
4918 // of dst, and zero the upper element.
4919 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
_mm_set_sd(double a)4920 FORCE_INLINE __m128d _mm_set_sd(double a)
4921 {
4922     return _mm_set_pd(0, a);
4923 }
4924 
4925 // Sets the 8 signed 16-bit integer values to w.
4926 //
4927 //   r0 := w
4928 //   r1 := w
4929 //   ...
4930 //   r7 := w
4931 //
4932 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
_mm_set1_epi16(short w)4933 FORCE_INLINE __m128i _mm_set1_epi16(short w)
4934 {
4935     return vreinterpretq_m128i_s16(vdupq_n_s16(w));
4936 }
4937 
4938 // Sets the 4 signed 32-bit integer values to i.
4939 //
4940 //   r0 := i
4941 //   r1 := i
4942 //   r2 := i
4943 //   r3 := I
4944 //
4945 // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
_mm_set1_epi32(int _i)4946 FORCE_INLINE __m128i _mm_set1_epi32(int _i)
4947 {
4948     return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
4949 }
4950 
4951 // Sets the 2 signed 64-bit integer values to i.
4952 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
_mm_set1_epi64(__m64 _i)4953 FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
4954 {
4955     return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
4956 }
4957 
4958 // Sets the 2 signed 64-bit integer values to i.
4959 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
_mm_set1_epi64x(int64_t _i)4960 FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
4961 {
4962     return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
4963 }
4964 
4965 // Sets the 16 signed 8-bit integer values to b.
4966 //
4967 //   r0 := b
4968 //   r1 := b
4969 //   ...
4970 //   r15 := b
4971 //
4972 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
_mm_set1_epi8(signed char w)4973 FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
4974 {
4975     return vreinterpretq_m128i_s8(vdupq_n_s8(w));
4976 }
4977 
4978 // Broadcast double-precision (64-bit) floating-point value a to all elements of
4979 // dst.
4980 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
_mm_set1_pd(double d)4981 FORCE_INLINE __m128d _mm_set1_pd(double d)
4982 {
4983 #if defined(__aarch64__)
4984     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
4985 #else
4986     return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
4987 #endif
4988 }
4989 
4990 // Sets the 8 signed 16-bit integer values in reverse order.
4991 //
4992 // Return Value
4993 //   r0 := w0
4994 //   r1 := w1
4995 //   ...
4996 //   r7 := w7
_mm_setr_epi16(short w0,short w1,short w2,short w3,short w4,short w5,short w6,short w7)4997 FORCE_INLINE __m128i _mm_setr_epi16(short w0,
4998                                     short w1,
4999                                     short w2,
5000                                     short w3,
5001                                     short w4,
5002                                     short w5,
5003                                     short w6,
5004                                     short w7)
5005 {
5006     int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5007     return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5008 }
5009 
5010 // Sets the 4 signed 32-bit integer values in reverse order
5011 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
_mm_setr_epi32(int i3,int i2,int i1,int i0)5012 FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5013 {
5014     int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5015     return vreinterpretq_m128i_s32(vld1q_s32(data));
5016 }
5017 
5018 // Set packed 64-bit integers in dst with the supplied values in reverse order.
5019 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5021 {
5022     return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5023 }
5024 
5025 // Sets the 16 signed 8-bit integer values in reverse order.
5026 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5028                                    signed char b1,
5029                                    signed char b2,
5030                                    signed char b3,
5031                                    signed char b4,
5032                                    signed char b5,
5033                                    signed char b6,
5034                                    signed char b7,
5035                                    signed char b8,
5036                                    signed char b9,
5037                                    signed char b10,
5038                                    signed char b11,
5039                                    signed char b12,
5040                                    signed char b13,
5041                                    signed char b14,
5042                                    signed char b15)
5043 {
5044     int8_t ALIGN_STRUCT(16)
5045         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
5046                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
5047                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
5048                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5049     return (__m128i) vld1q_s8(data);
5050 }
5051 
5052 // Set packed double-precision (64-bit) floating-point elements in dst with the
5053 // supplied values in reverse order.
5054 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5056 {
5057     return _mm_set_pd(e0, e1);
5058 }
5059 
5060 // Return vector of type __m128d with all elements set to zero.
5061 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
FORCE_INLINE __m128d _mm_setzero_pd(void)
5063 {
5064 #if defined(__aarch64__)
5065     return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5066 #else
5067     return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5068 #endif
5069 }
5070 
5071 // Sets the 128-bit value to zero
5072 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
FORCE_INLINE __m128i _mm_setzero_si128(void)
5074 {
5075     return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5076 }
5077 
5078 // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
5079 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
5080 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5081 //                                        __constrange(0,255) int imm)
5082 #if __has_builtin(__builtin_shufflevector)
5083 #define _mm_shuffle_epi32(a, imm)                              \
5084     __extension__({                                            \
5085         int32x4_t _input = vreinterpretq_s32_m128i(a);         \
5086         int32x4_t _shuf = __builtin_shufflevector(             \
5087             _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5088             ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
5089         vreinterpretq_m128i_s32(_shuf);                        \
5090     })
5091 #else  // generic
5092 #define _mm_shuffle_epi32(a, imm)                        \
5093     __extension__({                                      \
5094         __m128i ret;                                     \
5095         switch (imm) {                                   \
5096         case _MM_SHUFFLE(1, 0, 3, 2):                    \
5097             ret = _mm_shuffle_epi_1032((a));             \
5098             break;                                       \
5099         case _MM_SHUFFLE(2, 3, 0, 1):                    \
5100             ret = _mm_shuffle_epi_2301((a));             \
5101             break;                                       \
5102         case _MM_SHUFFLE(0, 3, 2, 1):                    \
5103             ret = _mm_shuffle_epi_0321((a));             \
5104             break;                                       \
5105         case _MM_SHUFFLE(2, 1, 0, 3):                    \
5106             ret = _mm_shuffle_epi_2103((a));             \
5107             break;                                       \
5108         case _MM_SHUFFLE(1, 0, 1, 0):                    \
5109             ret = _mm_shuffle_epi_1010((a));             \
5110             break;                                       \
5111         case _MM_SHUFFLE(1, 0, 0, 1):                    \
5112             ret = _mm_shuffle_epi_1001((a));             \
5113             break;                                       \
5114         case _MM_SHUFFLE(0, 1, 0, 1):                    \
5115             ret = _mm_shuffle_epi_0101((a));             \
5116             break;                                       \
5117         case _MM_SHUFFLE(2, 2, 1, 1):                    \
5118             ret = _mm_shuffle_epi_2211((a));             \
5119             break;                                       \
5120         case _MM_SHUFFLE(0, 1, 2, 2):                    \
5121             ret = _mm_shuffle_epi_0122((a));             \
5122             break;                                       \
5123         case _MM_SHUFFLE(3, 3, 3, 2):                    \
5124             ret = _mm_shuffle_epi_3332((a));             \
5125             break;                                       \
5126         case _MM_SHUFFLE(0, 0, 0, 0):                    \
5127             ret = _mm_shuffle_epi32_splat((a), 0);       \
5128             break;                                       \
5129         case _MM_SHUFFLE(1, 1, 1, 1):                    \
5130             ret = _mm_shuffle_epi32_splat((a), 1);       \
5131             break;                                       \
5132         case _MM_SHUFFLE(2, 2, 2, 2):                    \
5133             ret = _mm_shuffle_epi32_splat((a), 2);       \
5134             break;                                       \
5135         case _MM_SHUFFLE(3, 3, 3, 3):                    \
5136             ret = _mm_shuffle_epi32_splat((a), 3);       \
5137             break;                                       \
5138         default:                                         \
5139             ret = _mm_shuffle_epi32_default((a), (imm)); \
5140             break;                                       \
5141         }                                                \
5142         ret;                                             \
5143     })
5144 #endif
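
// Usage sketch (illustrative): selecting lanes with _MM_SHUFFLE, whose
// arguments name the source lane for result lanes 3..0.
//
//   __m128i v = _mm_setr_epi32(10, 20, 30, 40);
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 3, 3));
//   // r = {40, 40, 40, 40}: lane 3 broadcast to every lane
//   __m128i s = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
//   // s = {40, 30, 20, 10}: lane order reversed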
5145 
5146 // Shuffle double-precision (64-bit) floating-point elements using the control
5147 // in imm8, and store the results in dst.
5148 //
5149 //   dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
5150 //   dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
5151 //
5152 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
5153 #if __has_builtin(__builtin_shufflevector)
5154 #define _mm_shuffle_pd(a, b, imm8)                                          \
5155     vreinterpretq_m128d_s64(__builtin_shufflevector(                        \
5156         vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
5157         ((imm8 & 0x2) >> 1) + 2))
5158 #else
5159 #define _mm_shuffle_pd(a, b, imm8)                                     \
5160     _mm_castsi128_pd(_mm_set_epi64x(                                   \
5161         vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5162         vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5163 #endif
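
// Usage sketch (illustrative): imm8 bit 0 selects the lower result element
// from a, imm8 bit 1 selects the upper result element from b.
//
//   __m128d a = _mm_setr_pd(1.0, 2.0);      // a0 = 1.0, a1 = 2.0
//   __m128d b = _mm_setr_pd(3.0, 4.0);      // b0 = 3.0, b1 = 4.0
//   __m128d r = _mm_shuffle_pd(a, b, 0x1);  // r0 = a1 = 2.0, r1 = b0 = 3.0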
5164 
5165 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5166 //                                          __constrange(0,255) int imm)
5167 #if __has_builtin(__builtin_shufflevector)
5168 #define _mm_shufflehi_epi16(a, imm)                             \
5169     __extension__({                                             \
5170         int16x8_t _input = vreinterpretq_s16_m128i(a);          \
5171         int16x8_t _shuf = __builtin_shufflevector(              \
5172             _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
5173             (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5174             (((imm) >> 6) & 0x3) + 4);                          \
5175         vreinterpretq_m128i_s16(_shuf);                         \
5176     })
5177 #else  // generic
5178 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5179 #endif
5180 
5181 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5182 //                                          __constrange(0,255) int imm)
5183 #if __has_builtin(__builtin_shufflevector)
5184 #define _mm_shufflelo_epi16(a, imm)                                  \
5185     __extension__({                                                  \
5186         int16x8_t _input = vreinterpretq_s16_m128i(a);               \
5187         int16x8_t _shuf = __builtin_shufflevector(                   \
5188             _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
5189             (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5190         vreinterpretq_m128i_s16(_shuf);                              \
5191     })
5192 #else  // generic
5193 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5194 #endif
5195 
5196 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
5197 // shifting in zeros.
5198 //
5199 //   r0 := a0 << count
5200 //   r1 := a1 << count
5201 //   ...
5202 //   r7 := a7 << count
5203 //
5204 // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5206 {
5207     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5208     if (unlikely(c > 15))
5209         return _mm_setzero_si128();
5210 
5211     int16x8_t vc = vdupq_n_s16((int16_t) c);
5212     return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5213 }
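
// Usage sketch (illustrative): the shift count is taken from the low 64 bits
// of the count vector; this assumes _mm_cvtsi32_si128 (provided elsewhere in
// this header) to build it.
//
//   __m128i v = _mm_set1_epi16(1);
//   __m128i n = _mm_cvtsi32_si128(3);
//   __m128i r = _mm_sll_epi16(v, n);  // every 16-bit lane becomes 1 << 3 = 8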
5214 
5215 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
5216 // shifting in zeros.
5217 //
5218 // r0 := a0 << count
5219 // r1 := a1 << count
5220 // r2 := a2 << count
5221 // r3 := a3 << count
5222 //
5223 // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5225 {
5226     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5227     if (unlikely(c > 31))
5228         return _mm_setzero_si128();
5229 
5230     int32x4_t vc = vdupq_n_s32((int32_t) c);
5231     return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5232 }
5233 
5234 // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
5235 // shifting in zeros.
5236 //
5237 // r0 := a0 << count
5238 // r1 := a1 << count
5239 //
5240 // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5242 {
5243     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5244     if (unlikely(c > 63))
5245         return _mm_setzero_si128();
5246 
5247     int64x2_t vc = vdupq_n_s64((int64_t) c);
5248     return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5249 }
5250 
5251 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
5252 // shifting in zeros.
5253 //
5254 //   r0 := a0 << count
5255 //   r1 := a1 << count
5256 //   ...
5257 //   r7 := a7 << count
5258 //
5259 // https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
#define _mm_slli_epi16(a, imm)                                   \
    __extension__({                                              \
        __m128i ret;                                             \
        if (unlikely((imm) <= 0)) {                              \
            ret = a;                                             \
        } else if (unlikely((imm) > 15)) {                       \
            ret = _mm_setzero_si128();                           \
        } else {                                                 \
            ret = vreinterpretq_m128i_s16(                       \
                vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
        }                                                        \
        ret;                                                     \
    })
5274 
5275 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
// shifting in zeros.
5277 // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
5278 // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5280 {
5281     if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
5282         return a;
5283     if (unlikely(imm > 31))
5284         return _mm_setzero_si128();
5285     return vreinterpretq_m128i_s32(
5286         vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5287 }
5288 
5289 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5290 // store the results in dst.
FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5292 {
5293     if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
5294         return a;
5295     if (unlikely(imm > 63))
5296         return _mm_setzero_si128();
5297     return vreinterpretq_m128i_s64(
5298         vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5299 }
5300 
5301 // Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
5302 // must be an immediate.
5303 //
5304 //   r := a << (imm * 8)
5305 //
5306 // https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
5307 // FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_slli_si128(a, imm)                                          \
    __extension__({                                                     \
        __m128i ret;                                                    \
        if (unlikely((imm) <= 0)) {                                     \
            ret = a;                                                    \
        } else if (unlikely((imm) > 15)) {                              \
            ret = _mm_setzero_si128();                                  \
        } else {                                                        \
            ret = vreinterpretq_m128i_s8(vextq_s8(                      \
                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
        }                                                               \
        ret;                                                            \
    })
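
// Usage sketch (illustrative): a byte-granular shift moves whole 32-bit lanes
// when imm is a multiple of 4.
//
//   __m128i v = _mm_setr_epi32(1, 2, 3, 4);
//   __m128i r = _mm_slli_si128(v, 4);
//   // r = {0, 1, 2, 3}: lanes shifted up one position, zero fills lane 0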
5322 
5323 // Compute the square root of packed double-precision (64-bit) floating-point
5324 // elements in a, and store the results in dst.
5325 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5327 {
5328 #if defined(__aarch64__)
5329     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5330 #else
5331     double a0 = sqrt(((double *) &a)[0]);
5332     double a1 = sqrt(((double *) &a)[1]);
5333     return _mm_set_pd(a1, a0);
5334 #endif
5335 }
5336 
5337 // Compute the square root of the lower double-precision (64-bit) floating-point
5338 // element in b, store the result in the lower element of dst, and copy the
5339 // upper element from a to the upper element of dst.
5340 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5342 {
5343 #if defined(__aarch64__)
5344     return _mm_move_sd(a, _mm_sqrt_pd(b));
5345 #else
5346     return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5347 #endif
5348 }
5349 
5350 // Shifts the 8 signed 16-bit integers in a right by count bits while shifting
5351 // in the sign bit.
5352 //
5353 //   r0 := a0 >> count
5354 //   r1 := a1 >> count
5355 //   ...
5356 //   r7 := a7 >> count
5357 //
5358 // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5360 {
5361     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5362     if (unlikely(c > 15))
5363         return _mm_cmplt_epi16(a, _mm_setzero_si128());
5364     return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5365 }
5366 
5367 // Shifts the 4 signed 32-bit integers in a right by count bits while shifting
5368 // in the sign bit.
5369 //
5370 //   r0 := a0 >> count
5371 //   r1 := a1 >> count
5372 //   r2 := a2 >> count
5373 //   r3 := a3 >> count
5374 //
5375 // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5377 {
5378     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5379     if (unlikely(c > 31))
5380         return _mm_cmplt_epi32(a, _mm_setzero_si128());
5381     return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5382 }
5383 
5384 // Shift packed 16-bit integers in a right by imm while shifting in sign
5385 // bits, and store the results in dst.
5386 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5388 {
5389     const int count = (imm & ~15) ? 15 : imm;
5390     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5391 }
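
// Usage sketch (illustrative): the arithmetic shift replicates the sign bit,
// unlike the logical _mm_srli_epi16.
//
//   __m128i v = _mm_set1_epi16(-16);
//   __m128i r = _mm_srai_epi16(v, 2);  // every lane is -4
//   // _mm_srli_epi16(v, 2) would instead give 0x3FFC in every lane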
5392 
5393 // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
5394 // and store the results in dst.
5395 //
5396 //   FOR j := 0 to 3
5397 //     i := j*32
5398 //     IF imm8[7:0] > 31
5399 //       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5400 //     ELSE
5401 //       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
5402 //     FI
5403 //   ENDFOR
5404 //
5405 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
5406 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
5407 #define _mm_srai_epi32(a, imm)                                             \
5408     __extension__({                                                        \
5409         __m128i ret;                                                       \
5410         if (unlikely((imm) == 0)) {                                        \
5411             ret = a;                                                       \
5412         } else if (likely(0 < (imm) && (imm) < 32)) {                      \
5413             ret = vreinterpretq_m128i_s32(                                 \
                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
5415         } else {                                                           \
5416             ret = vreinterpretq_m128i_s32(                                 \
5417                 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));              \
5418         }                                                                  \
5419         ret;                                                               \
5420     })
5421 
5422 // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
5423 // while shifting in zeros.
5424 //
5425 // r0 := srl(a0, count)
5426 // r1 := srl(a1, count)
5427 // ...
5428 // r7 := srl(a7, count)
5429 //
5430 // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5432 {
5433     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5434     if (unlikely(c > 15))
5435         return _mm_setzero_si128();
5436 
5437     int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5438     return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5439 }
5440 
5441 // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
5442 // while shifting in zeros.
5443 //
5444 // r0 := srl(a0, count)
5445 // r1 := srl(a1, count)
5446 // r2 := srl(a2, count)
5447 // r3 := srl(a3, count)
5448 //
5449 // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5451 {
5452     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5453     if (unlikely(c > 31))
5454         return _mm_setzero_si128();
5455 
5456     int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5457     return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5458 }
5459 
5460 // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
5461 // while shifting in zeros.
5462 //
5463 // r0 := srl(a0, count)
5464 // r1 := srl(a1, count)
5465 //
5466 // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5468 {
5469     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5470     if (unlikely(c > 63))
5471         return _mm_setzero_si128();
5472 
5473     int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5474     return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5475 }
5476 
5477 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5478 // store the results in dst.
5479 //
5480 //   FOR j := 0 to 7
5481 //     i := j*16
5482 //     IF imm8[7:0] > 15
5483 //       dst[i+15:i] := 0
5484 //     ELSE
5485 //       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
5486 //     FI
5487 //   ENDFOR
5488 //
5489 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
5490 #define _mm_srli_epi16(a, imm)                                             \
5491     __extension__({                                                        \
5492         __m128i ret;                                                       \
        if (unlikely((imm) == 0)) {                                        \
5494             ret = a;                                                       \
5495         } else if (likely(0 < (imm) && (imm) < 16)) {                      \
5496             ret = vreinterpretq_m128i_u16(                                 \
                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
5498         } else {                                                           \
5499             ret = _mm_setzero_si128();                                     \
5500         }                                                                  \
5501         ret;                                                               \
5502     })
5503 
5504 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5505 // store the results in dst.
5506 //
5507 //   FOR j := 0 to 3
5508 //     i := j*32
5509 //     IF imm8[7:0] > 31
5510 //       dst[i+31:i] := 0
5511 //     ELSE
5512 //       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
5513 //     FI
5514 //   ENDFOR
5515 //
5516 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
5517 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5518 #define _mm_srli_epi32(a, imm)                                             \
5519     __extension__({                                                        \
5520         __m128i ret;                                                       \
5521         if (unlikely((imm) == 0)) {                                        \
5522             ret = a;                                                       \
5523         } else if (likely(0 < (imm) && (imm) < 32)) {                      \
5524             ret = vreinterpretq_m128i_u32(                                 \
                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
5526         } else {                                                           \
5527             ret = _mm_setzero_si128();                                     \
5528         }                                                                  \
5529         ret;                                                               \
5530     })
5531 
5532 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5533 // store the results in dst.
5534 //
5535 //   FOR j := 0 to 1
5536 //     i := j*64
5537 //     IF imm8[7:0] > 63
5538 //       dst[i+63:i] := 0
5539 //     ELSE
5540 //       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
5541 //     FI
5542 //   ENDFOR
5543 //
5544 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
5545 #define _mm_srli_epi64(a, imm)                                             \
5546     __extension__({                                                        \
5547         __m128i ret;                                                       \
5548         if (unlikely((imm) == 0)) {                                        \
5549             ret = a;                                                       \
5550         } else if (likely(0 < (imm) && (imm) < 64)) {                      \
5551             ret = vreinterpretq_m128i_u64(                                 \
                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
5553         } else {                                                           \
5554             ret = _mm_setzero_si128();                                     \
5555         }                                                                  \
5556         ret;                                                               \
5557     })
5558 
// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
// imm must be an immediate.
5561 //
5562 //   r := srl(a, imm*8)
5563 //
5564 // https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
5565 // FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_si128(a, imm)                                              \
    __extension__({                                                         \
        __m128i ret;                                                        \
        if (unlikely((imm) <= 0)) {                                         \
            ret = a;                                                        \
        } else if (unlikely((imm) > 15)) {                                  \
            ret = _mm_setzero_si128();                                      \
        } else {                                                            \
            ret = vreinterpretq_m128i_s8(                                   \
                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
        }                                                                   \
        ret;                                                                \
    })
5580 
5581 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5582 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5583 // or a general-protection exception may be generated.
5584 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5586 {
5587 #if defined(__aarch64__)
5588     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5589 #else
5590     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5591 #endif
5592 }
5593 
5594 // Store the lower double-precision (64-bit) floating-point element from a into
5595 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5596 // boundary or a general-protection exception may be generated.
5597 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5599 {
5600 #if defined(__aarch64__)
5601     float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5602     vst1q_f64((float64_t *) mem_addr,
5603               vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5604 #else
5605     float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5606     vst1q_f32((float32_t *) mem_addr,
5607               vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5608 #endif
5609 }
5610 
5611 // Store the lower double-precision (64-bit) floating-point element from a into
5612 // memory. mem_addr does not need to be aligned on any particular boundary.
5613 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5615 {
5616 #if defined(__aarch64__)
5617     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5618 #else
5619     vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5620 #endif
5621 }
5622 
// Stores four 32-bit integer values (as a __m128i value) at the address p.
5624 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5626 {
5627     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5628 }
5629 
5630 // Store the lower double-precision (64-bit) floating-point element from a into
5631 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5632 // boundary or a general-protection exception may be generated.
5633 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
5634 #define _mm_store1_pd _mm_store_pd1
5635 
5636 // Store the upper double-precision (64-bit) floating-point element from a into
5637 // memory.
5638 //
5639 //   MEM[mem_addr+63:mem_addr] := a[127:64]
5640 //
5641 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5643 {
5644 #if defined(__aarch64__)
5645     vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5646 #else
5647     vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5648 #endif
5649 }
5650 
5651 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
5652 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5654 {
5655     uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
5656     uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
5657     *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
5658 }
5659 
5660 // Store the lower double-precision (64-bit) floating-point element from a into
5661 // memory.
5662 //
5663 //   MEM[mem_addr+63:mem_addr] := a[63:0]
5664 //
5665 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5667 {
5668 #if defined(__aarch64__)
5669     vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5670 #else
5671     vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5672 #endif
5673 }
5674 
5675 // Store 2 double-precision (64-bit) floating-point elements from a into memory
5676 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5677 // general-protection exception may be generated.
5678 //
5679 //   MEM[mem_addr+63:mem_addr] := a[127:64]
5680 //   MEM[mem_addr+127:mem_addr+64] := a[63:0]
5681 //
5682 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5684 {
5685     float32x4_t f = vreinterpretq_f32_m128d(a);
5686     _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5687 }
5688 
5689 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5690 // elements) from a into memory. mem_addr does not need to be aligned on any
5691 // particular boundary.
5692 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5694 {
5695     _mm_store_pd(mem_addr, a);
5696 }
5697 
5698 // Stores 128-bits of integer data a at the address p.
5699 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5701 {
5702     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5703 }
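
// Usage sketch (illustrative): NEON vst1q has no 16-byte alignment
// requirement, so the unaligned store maps to the same instruction as the
// aligned one.
//
//   int32_t buf[4];
//   _mm_storeu_si128((__m128i *) buf, _mm_setr_epi32(1, 2, 3, 4));
//   // buf now holds {1, 2, 3, 4}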
5704 
5705 // Stores 32-bits of integer data a at the address p.
5706 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5708 {
5709     vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5710 }
5711 
5712 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5713 // elements) from a into memory using a non-temporal memory hint. mem_addr must
5714 // be aligned on a 16-byte boundary or a general-protection exception may be
5715 // generated.
5716 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5718 {
5719 #if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, (__m128d *) p);
5721 #elif defined(__aarch64__)
5722     vst1q_f64(p, vreinterpretq_f64_m128d(a));
5723 #else
5724     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5725 #endif
5726 }
5727 
5728 // Stores the data in a to the address p without polluting the caches.  If the
5729 // cache line containing address p is already in the cache, the cache will be
5730 // updated.
5731 // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5733 {
5734 #if __has_builtin(__builtin_nontemporal_store)
5735     __builtin_nontemporal_store(a, p);
5736 #else
5737     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5738 #endif
5739 }
5740 
5741 // Store 32-bit integer a into memory using a non-temporal hint to minimize
5742 // cache pollution. If the cache line containing address mem_addr is already in
5743 // the cache, the cache will be updated.
5744 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
FORCE_INLINE void _mm_stream_si32(int *p, int a)
5746 {
5747     vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5748 }
5749 
5750 // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
5751 // store the results in dst.
5752 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
5754 {
5755     return vreinterpretq_m128i_s16(
5756         vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5757 }
5758 
5759 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
5760 // unsigned 32-bit integers of a.
5761 //
5762 //   r0 := a0 - b0
5763 //   r1 := a1 - b1
5764 //   r2 := a2 - b2
5765 //   r3 := a3 - b3
5766 //
5767 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
5769 {
5770     return vreinterpretq_m128i_s32(
5771         vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5772 }
5773 
5774 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
5775 // and store the results in dst.
5776 //    r0 := a0 - b0
5777 //    r1 := a1 - b1
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
5779 {
5780     return vreinterpretq_m128i_s64(
5781         vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5782 }
5783 
5784 // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
5785 // store the results in dst.
5786 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
5788 {
5789     return vreinterpretq_m128i_s8(
5790         vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5791 }
5792 
5793 // Subtract packed double-precision (64-bit) floating-point elements in b from
5794 // packed double-precision (64-bit) floating-point elements in a, and store the
5795 // results in dst.
5796 //
5797 //   FOR j := 0 to 1
5798 //     i := j*64
5799 //     dst[i+63:i] := a[i+63:i] - b[i+63:i]
5800 //   ENDFOR
5801 //
5802 //  https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
5804 {
5805 #if defined(__aarch64__)
5806     return vreinterpretq_m128d_f64(
5807         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5808 #else
5809     double *da = (double *) &a;
5810     double *db = (double *) &b;
5811     double c[2];
5812     c[0] = da[0] - db[0];
5813     c[1] = da[1] - db[1];
5814     return vld1q_f32((float32_t *) c);
5815 #endif
5816 }
5817 
5818 // Subtract the lower double-precision (64-bit) floating-point element in b from
5819 // the lower double-precision (64-bit) floating-point element in a, store the
5820 // result in the lower element of dst, and copy the upper element from a to the
5821 // upper element of dst.
5822 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
5824 {
5825     return _mm_move_sd(a, _mm_sub_pd(a, b));
5826 }
5827 
5828 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
5829 //
5830 //   dst[63:0] := a[63:0] - b[63:0]
5831 //
5832 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
5834 {
5835     return vreinterpret_m64_s64(
5836         vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
5837 }
5838 
5839 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
5840 // of a and saturates.
5841 //
5842 //   r0 := SignedSaturate(a0 - b0)
5843 //   r1 := SignedSaturate(a1 - b1)
5844 //   ...
5845 //   r7 := SignedSaturate(a7 - b7)
5846 //
5847 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
5849 {
5850     return vreinterpretq_m128i_s16(
5851         vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5852 }
5853 
5854 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
5855 // of a and saturates.
5856 //
5857 //   r0 := SignedSaturate(a0 - b0)
5858 //   r1 := SignedSaturate(a1 - b1)
5859 //   ...
5860 //   r15 := SignedSaturate(a15 - b15)
5861 //
5862 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
5864 {
5865     return vreinterpretq_m128i_s8(
5866         vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5867 }
5868 
// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a and saturates.
5871 // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
5873 {
5874     return vreinterpretq_m128i_u16(
5875         vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
5876 }
5877 
5878 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
5879 // integers of a and saturates.
5880 //
5881 //   r0 := UnsignedSaturate(a0 - b0)
5882 //   r1 := UnsignedSaturate(a1 - b1)
5883 //   ...
5884 //   r15 := UnsignedSaturate(a15 - b15)
5885 //
5886 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
5888 {
5889     return vreinterpretq_m128i_u8(
5890         vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
5891 }
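
// Usage sketch (illustrative): unsigned saturation clamps at zero, which
// gives the usual absolute-difference idiom; this assumes _mm_or_si128
// (provided elsewhere in this header).
//
//   __m128i a = _mm_set1_epi8(10);
//   __m128i b = _mm_set1_epi8(25);
//   __m128i r = _mm_subs_epu8(a, b);  // 10 - 25 saturates to 0
//   __m128i d = _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
//   // d holds the per-lane absolute difference, 15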
5892 
5893 #define _mm_ucomieq_sd _mm_comieq_sd
5894 #define _mm_ucomige_sd _mm_comige_sd
5895 #define _mm_ucomigt_sd _mm_comigt_sd
5896 #define _mm_ucomile_sd _mm_comile_sd
5897 #define _mm_ucomilt_sd _mm_comilt_sd
5898 #define _mm_ucomineq_sd _mm_comineq_sd
5899 
5900 // Return vector of type __m128d with undefined elements.
5901 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
FORCE_INLINE __m128d _mm_undefined_pd(void)
5903 {
5904 #if defined(__GNUC__) || defined(__clang__)
5905 #pragma GCC diagnostic push
5906 #pragma GCC diagnostic ignored "-Wuninitialized"
5907 #endif
5908     __m128d a;
5909     return a;
5910 #if defined(__GNUC__) || defined(__clang__)
5911 #pragma GCC diagnostic pop
5912 #endif
5913 }
5914 
5915 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5916 // upper 4 signed or unsigned 16-bit integers in b.
5917 //
5918 //   r0 := a4
5919 //   r1 := b4
5920 //   r2 := a5
5921 //   r3 := b5
5922 //   r4 := a6
5923 //   r5 := b6
5924 //   r6 := a7
5925 //   r7 := b7
5926 //
5927 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5929 {
5930 #if defined(__aarch64__)
5931     return vreinterpretq_m128i_s16(
5932         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5933 #else
5934     int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5935     int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5936     int16x4x2_t result = vzip_s16(a1, b1);
5937     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5938 #endif
5939 }
5940 
5941 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
5942 // upper 2 signed or unsigned 32-bit integers in b.
5943 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5945 {
5946 #if defined(__aarch64__)
5947     return vreinterpretq_m128i_s32(
5948         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5949 #else
5950     int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5951     int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5952     int32x2x2_t result = vzip_s32(a1, b1);
5953     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5954 #endif
5955 }
5956 
5957 // Interleaves the upper signed or unsigned 64-bit integer in a with the
5958 // upper signed or unsigned 64-bit integer in b.
5959 //
5960 //   r0 := a1
5961 //   r1 := b1
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5963 {
5964     int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5965     int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5966     return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5967 }
5968 
5969 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
5970 // 8 signed or unsigned 8-bit integers in b.
5971 //
5972 //   r0 := a8
5973 //   r1 := b8
5974 //   r2 := a9
5975 //   r3 := b9
5976 //   ...
5977 //   r14 := a15
5978 //   r15 := b15
5979 //
5980 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5982 {
5983 #if defined(__aarch64__)
5984     return vreinterpretq_m128i_s8(
5985         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5986 #else
5987     int8x8_t a1 =
5988         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5989     int8x8_t b1 =
5990         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5991     int8x8x2_t result = vzip_s8(a1, b1);
5992     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5993 #endif
5994 }
5995 
5996 // Unpack and interleave double-precision (64-bit) floating-point elements from
5997 // the high half of a and b, and store the results in dst.
5998 //
5999 //   DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
6000 //     dst[63:0] := src1[127:64]
6001 //     dst[127:64] := src2[127:64]
6002 //     RETURN dst[127:0]
6003 //   }
6004 //   dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
6005 //
6006 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6008 {
6009 #if defined(__aarch64__)
6010     return vreinterpretq_m128d_f64(
6011         vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6012 #else
6013     return vreinterpretq_m128d_s64(
6014         vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6015                      vget_high_s64(vreinterpretq_s64_m128d(b))));
6016 #endif
6017 }
6018 
6019 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6020 // lower 4 signed or unsigned 16-bit integers in b.
6021 //
6022 //   r0 := a0
6023 //   r1 := b0
6024 //   r2 := a1
6025 //   r3 := b1
6026 //   r4 := a2
6027 //   r5 := b2
6028 //   r6 := a3
6029 //   r7 := b3
6030 //
6031 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6033 {
6034 #if defined(__aarch64__)
6035     return vreinterpretq_m128i_s16(
6036         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6037 #else
6038     int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6039     int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6040     int16x4x2_t result = vzip_s16(a1, b1);
6041     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6042 #endif
6043 }
6044 
// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
// lower 2 signed or unsigned 32-bit integers in b.
6047 //
6048 //   r0 := a0
6049 //   r1 := b0
6050 //   r2 := a1
6051 //   r3 := b1
6052 //
6053 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6055 {
6056 #if defined(__aarch64__)
6057     return vreinterpretq_m128i_s32(
6058         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6059 #else
6060     int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6061     int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6062     int32x2x2_t result = vzip_s32(a1, b1);
6063     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6064 #endif
6065 }
6066 
// Interleaves the lower signed or unsigned 64-bit integer in a with the
// lower signed or unsigned 64-bit integer in b.
//
//   r0 := a0
//   r1 := b0
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6068 {
6069     int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6070     int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6071     return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6072 }
6073 
6074 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
6075 // 8 signed or unsigned 8-bit integers in b.
6076 //
6077 //   r0 := a0
6078 //   r1 := b0
6079 //   r2 := a1
6080 //   r3 := b1
6081 //   ...
6082 //   r14 := a7
6083 //   r15 := b7
6084 //
6085 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6087 {
6088 #if defined(__aarch64__)
6089     return vreinterpretq_m128i_s8(
6090         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6091 #else
6092     int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6093     int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6094     int8x8x2_t result = vzip_s8(a1, b1);
6095     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6096 #endif
6097 }
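
// Usage sketch (illustrative): interleaving the low halves of two byte
// vectors.
//
//   __m128i lo = _mm_set1_epi8(0x11);
//   __m128i hi = _mm_set1_epi8(0x22);
//   __m128i r = _mm_unpacklo_epi8(lo, hi);
//   // r = {0x11, 0x22, 0x11, 0x22, ...}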
6098 
6099 // Unpack and interleave double-precision (64-bit) floating-point elements from
6100 // the low half of a and b, and store the results in dst.
6101 //
6102 //   DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
6103 //     dst[63:0] := src1[63:0]
6104 //     dst[127:64] := src2[63:0]
6105 //     RETURN dst[127:0]
6106 //   }
6107 //   dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
6108 //
6109 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
6111 {
6112 #if defined(__aarch64__)
6113     return vreinterpretq_m128d_f64(
6114         vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6115 #else
6116     return vreinterpretq_m128d_s64(
6117         vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6118                      vget_low_s64(vreinterpretq_s64_m128d(b))));
6119 #endif
6120 }
6121 
6122 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
6123 // elements in a and b, and store the results in dst.
6124 //
6125 //   FOR j := 0 to 1
6126 //      i := j*64
6127 //      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
6128 //   ENDFOR
6129 //
6130 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6132 {
6133     return vreinterpretq_m128d_s64(
6134         veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6135 }
6136 
6137 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
6138 // b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
6140 {
6141     return vreinterpretq_m128i_s32(
6142         veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6143 }
6144 
6145 /* SSE3 */
6146 
6147 // Alternatively add and subtract packed double-precision (64-bit)
6148 // floating-point elements in a to/from packed elements in b, and store the
6149 // results in dst.
6150 //
6151 // FOR j := 0 to 1
6152 //   i := j*64
6153 //   IF ((j & 1) == 0)
6154 //     dst[i+63:i] := a[i+63:i] - b[i+63:i]
6155 //   ELSE
6156 //     dst[i+63:i] := a[i+63:i] + b[i+63:i]
6157 //   FI
6158 // ENDFOR
6159 //
6160 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
6162 {
    __m128d mask = _mm_set_pd(1.0, -1.0);
6164 #if defined(__aarch64__)
6165     return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6166                                              vreinterpretq_f64_m128d(b),
6167                                              vreinterpretq_f64_m128d(mask)));
6168 #else
6169     return _mm_add_pd(_mm_mul_pd(b, mask), a);
6170 #endif
6171 }
6172 
6173 // Alternatively add and subtract packed single-precision (32-bit)
6174 // floating-point elements in a to/from packed elements in b, and store the
6175 // results in dst.
6176 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
6178 {
6179     __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
6180 #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
6181     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
6182                                             vreinterpretq_f32_m128(mask),
6183                                             vreinterpretq_f32_m128(b)));
6184 #else
6185     return _mm_add_ps(_mm_mul_ps(b, mask), a);
6186 #endif
6187 }
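
// Usage sketch (illustrative): even-indexed lanes subtract, odd-indexed lanes
// add; this assumes _mm_setr_ps and _mm_set1_ps (provided elsewhere in this
// header).
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_set1_ps(10.0f);
//   __m128 r = _mm_addsub_ps(a, b);  // r = {-9.0f, 12.0f, -7.0f, 14.0f}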
6188 
6189 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
6190 // elements in a and b, and pack the results in dst.
6191 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
6193 {
6194 #if defined(__aarch64__)
6195     return vreinterpretq_m128d_f64(
6196         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6197 #else
6198     double *da = (double *) &a;
6199     double *db = (double *) &b;
6200     double c[] = {da[0] + da[1], db[0] + db[1]};
6201     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6202 #endif
6203 }
6204 
6205 // Computes pairwise add of each argument as single-precision, floating-point
6206 // values a and b.
6207 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
6209 {
6210 #if defined(__aarch64__)
6211     return vreinterpretq_m128_f32(
6212         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6213 #else
6214     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6215     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6216     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6217     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6218     return vreinterpretq_m128_f32(
6219         vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6220 #endif
6221 }
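
// Usage sketch (illustrative): two horizontal adds reduce a vector to its
// sum; this assumes _mm_setr_ps and _mm_cvtss_f32 (provided elsewhere in this
// header).
//
//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 s = _mm_hadd_ps(v, v);      // {3, 7, 3, 7}
//   s = _mm_hadd_ps(s, s);             // {10, 10, 10, 10}
//   float sum = _mm_cvtss_f32(s);      // 10.0f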
6222 
6223 // Horizontally subtract adjacent pairs of double-precision (64-bit)
6224 // floating-point elements in a and b, and pack the results in dst.
6225 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
6227 {
6228 #if defined(__aarch64__)
6229     return vreinterpretq_m128d_f64(vsubq_f64(
6230         vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)),
6231         vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b))));
6232 #else
6233     double *da = (double *) &_a;
6234     double *db = (double *) &_b;
6235     double c[] = {da[0] - da[1], db[0] - db[1]};
6236     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6237 #endif
6238 }
6239 
// Horizontally subtract adjacent pairs of single-precision (32-bit)
6241 // floating-point elements in a and b, and pack the results in dst.
6242 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
6244 {
6245 #if defined(__aarch64__)
6246     return vreinterpretq_m128_f32(vsubq_f32(
6247         vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
6248         vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
6249 #else
6250     float32x4x2_t c =
6251         vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
6252     return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
6253 #endif
6254 }
6255 
6256 // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
6257 // may perform better than _mm_loadu_si128 when the data crosses a cache line
6258 // boundary.
6259 //
6260 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
6261 //
6262 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
6263 #define _mm_lddqu_si128 _mm_loadu_si128
6264 
6265 // Load a double-precision (64-bit) floating-point element from memory into both
6266 // elements of dst.
6267 //
6268 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
6269 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
6270 //
6271 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
6272 #define _mm_loaddup_pd _mm_load1_pd
6273 
6274 // Duplicate the low double-precision (64-bit) floating-point element from a,
6275 // and store the results in dst.
6276 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6278 {
#if defined(__aarch64__)
6280     return vreinterpretq_m128d_f64(
6281         vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6282 #else
6283     return vreinterpretq_m128d_u64(
6284         vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6285 #endif
6286 }
6287 
6288 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
6289 // from a, and store the results in dst.
6290 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6292 {
6293 #if __has_builtin(__builtin_shufflevector)
6294     return vreinterpretq_m128_f32(__builtin_shufflevector(
6295         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6296 #else
6297     float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6298     float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6299     float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6300     return vreinterpretq_m128_f32(vld1q_f32(data));
6301 #endif
6302 }
6303 
6304 // Duplicate even-indexed single-precision (32-bit) floating-point elements
6305 // from a, and store the results in dst.
6306 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6308 {
6309 #if __has_builtin(__builtin_shufflevector)
6310     return vreinterpretq_m128_f32(__builtin_shufflevector(
6311         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6312 #else
6313     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6314     float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6315     float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6316     return vreinterpretq_m128_f32(vld1q_f32(data));
6317 #endif
6318 }
6319 
6320 /* SSSE3 */
6321 
6322 // Compute the absolute value of packed signed 16-bit integers in a, and store
6323 // the unsigned results in dst.
6324 //
6325 //   FOR j := 0 to 7
6326 //     i := j*16
6327 //     dst[i+15:i] := ABS(a[i+15:i])
6328 //   ENDFOR
6329 //
6330 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6332 {
6333     return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6334 }
6335 
6336 // Compute the absolute value of packed signed 32-bit integers in a, and store
6337 // the unsigned results in dst.
6338 //
6339 //   FOR j := 0 to 3
6340 //     i := j*32
6341 //     dst[i+31:i] := ABS(a[i+31:i])
6342 //   ENDFOR
6343 //
6344 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6346 {
6347     return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6348 }
6349 
6350 // Compute the absolute value of packed signed 8-bit integers in a, and store
6351 // the unsigned results in dst.
6352 //
6353 //   FOR j := 0 to 15
6354 //     i := j*8
6355 //     dst[i+7:i] := ABS(a[i+7:i])
6356 //   ENDFOR
6357 //
6358 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6360 {
6361     return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6362 }
6363 
6364 // Compute the absolute value of packed signed 16-bit integers in a, and store
6365 // the unsigned results in dst.
6366 //
6367 //   FOR j := 0 to 3
6368 //     i := j*16
6369 //     dst[i+15:i] := ABS(a[i+15:i])
6370 //   ENDFOR
6371 //
6372 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6374 {
6375     return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6376 }
6377 
6378 // Compute the absolute value of packed signed 32-bit integers in a, and store
6379 // the unsigned results in dst.
6380 //
6381 //   FOR j := 0 to 1
6382 //     i := j*32
6383 //     dst[i+31:i] := ABS(a[i+31:i])
6384 //   ENDFOR
6385 //
6386 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6388 {
6389     return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6390 }
6391 
6392 // Compute the absolute value of packed signed 8-bit integers in a, and store
6393 // the unsigned results in dst.
6394 //
6395 //   FOR j := 0 to 7
6396 //     i := j*8
6397 //     dst[i+7:i] := ABS(a[i+7:i])
6398 //   ENDFOR
6399 //
6400 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6402 {
6403     return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6404 }
6405 
6406 // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6407 // the result right by imm8 bytes, and store the low 16 bytes in dst.
6408 //
6409 //   tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
6410 //   dst[127:0] := tmp[127:0]
6411 //
6412 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
6413 #define _mm_alignr_epi8(a, b, imm)                                            \
6414     __extension__({                                                           \
6415         __m128i ret;                                                          \
6416         if (unlikely((imm) >= 32)) {                                          \
6417             ret = _mm_setzero_si128();                                        \
6418         } else {                                                              \
6419             uint8x16_t tmp_low, tmp_high;                                     \
6420             if (imm >= 16) {                                                  \
6421                 const int idx = imm - 16;                                     \
6422                 tmp_low = vreinterpretq_u8_m128i(a);                          \
6423                 tmp_high = vdupq_n_u8(0);                                     \
6424                 ret =                                                         \
6425                     vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
6426             } else {                                                          \
6427                 const int idx = imm;                                          \
6428                 tmp_low = vreinterpretq_u8_m128i(b);                          \
6429                 tmp_high = vreinterpretq_u8_m128i(a);                         \
6430                 ret =                                                         \
6431                     vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
6432             }                                                                 \
6433         }                                                                     \
6434         ret;                                                                  \
6435     })
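// Illustrative sketch (operand values are arbitrary assumptions): with
// imm = 4 the result is the high 12 bytes of b followed by the low 4 bytes
// of a.
//
//   __m128i a = _mm_set1_epi8(0x11);
//   __m128i b = _mm_set1_epi8(0x22);
//   __m128i r = _mm_alignr_epi8(a, b, 4);
//   // bytes 0..11 of r are 0x22 (from b), bytes 12..15 are 0x11 (from a)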
6436 
6437 // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6438 // the result right by imm8 bytes, and store the low 8 bytes in dst.
6439 //
6440 //   tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
6441 //   dst[63:0] := tmp[63:0]
6442 //
6443 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
6444 #define _mm_alignr_pi8(a, b, imm)                                           \
6445     __extension__({                                                         \
6446         __m64 ret;                                                          \
6447         if (unlikely((imm) >= 16)) {                                        \
6448             ret = vreinterpret_m64_s8(vdup_n_s8(0));                        \
6449         } else {                                                            \
6450             uint8x8_t tmp_low, tmp_high;                                    \
6451             if (imm >= 8) {                                                 \
6452                 const int idx = imm - 8;                                    \
6453                 tmp_low = vreinterpret_u8_m64(a);                           \
6454                 tmp_high = vdup_n_u8(0);                                    \
6455                 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6456             } else {                                                        \
6457                 const int idx = imm;                                        \
6458                 tmp_low = vreinterpret_u8_m64(b);                           \
6459                 tmp_high = vreinterpret_u8_m64(a);                          \
6460                 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6461             }                                                               \
6462         }                                                                   \
6463         ret;                                                                \
6464     })
6465 
// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
// signed 16-bit results in dst.
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6469 {
6470     int16x8_t a = vreinterpretq_s16_m128i(_a);
6471     int16x8_t b = vreinterpretq_s16_m128i(_b);
6472 #if defined(__aarch64__)
6473     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6474 #else
6475     return vreinterpretq_m128i_s16(
6476         vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6477                      vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6478 #endif
6479 }
6480 
// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
// signed 32-bit results in dst.
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6484 {
6485     int32x4_t a = vreinterpretq_s32_m128i(_a);
6486     int32x4_t b = vreinterpretq_s32_m128i(_b);
6487     return vreinterpretq_m128i_s32(
6488         vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6489                      vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6490 }
6491 
6492 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6493 // signed 16-bit results in dst.
6494 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6496 {
6497     return vreinterpret_m64_s16(
6498         vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6499 }
6500 
6501 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6502 // signed 32-bit results in dst.
6503 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6505 {
6506     return vreinterpret_m64_s32(
6507         vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6508 }
6509 
// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
// saturation, and pack the signed 16-bit results in dst.
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6513 {
6514 #if defined(__aarch64__)
6515     int16x8_t a = vreinterpretq_s16_m128i(_a);
6516     int16x8_t b = vreinterpretq_s16_m128i(_b);
    return vreinterpretq_m128i_s16(
6518         vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6519 #else
6520     int32x4_t a = vreinterpretq_s32_m128i(_a);
6521     int32x4_t b = vreinterpretq_s32_m128i(_b);
6522     // Interleave using vshrn/vmovn
6523     // [a0|a2|a4|a6|b0|b2|b4|b6]
6524     // [a1|a3|a5|a7|b1|b3|b5|b7]
6525     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6526     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6527     // Saturated add
6528     return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6529 #endif
6530 }
6531 
6532 // Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6533 // saturation, and pack the signed 16-bit results in dst.
6534 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6536 {
6537     int16x4_t a = vreinterpret_s16_m64(_a);
6538     int16x4_t b = vreinterpret_s16_m64(_b);
6539 #if defined(__aarch64__)
    return vreinterpret_m64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6541 #else
6542     int16x4x2_t res = vuzp_s16(a, b);
    return vreinterpret_m64_s16(vqadd_s16(res.val[0], res.val[1]));
6544 #endif
6545 }
6546 
// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
// the signed 16-bit results in dst.
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6550 {
6551     int32x4_t a = vreinterpretq_s32_m128i(_a);
6552     int32x4_t b = vreinterpretq_s32_m128i(_b);
6553     // Interleave using vshrn/vmovn
6554     // [a0|a2|a4|a6|b0|b2|b4|b6]
6555     // [a1|a3|a5|a7|b1|b3|b5|b7]
6556     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6557     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6558     // Subtract
6559     return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
6560 }
6561 
// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
// the signed 32-bit results in dst.
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6565 {
6566     int64x2_t a = vreinterpretq_s64_m128i(_a);
6567     int64x2_t b = vreinterpretq_s64_m128i(_b);
6568     // Interleave using vshrn/vmovn
6569     // [a0|a2|b0|b2]
    // [a1|a3|b1|b3]
6571     int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
6572     int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
6573     // Subtract
6574     return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
6575 }
6576 
6577 // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6578 // the signed 16-bit results in dst.
6579 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6581 {
6582     int32x4_t ab =
6583         vcombine_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
6584 
6585     int16x4_t ab_low_bits = vmovn_s32(ab);
6586     int16x4_t ab_high_bits = vshrn_n_s32(ab, 16);
6587 
6588     return vreinterpret_m64_s16(vsub_s16(ab_low_bits, ab_high_bits));
6589 }
6590 
6591 // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6592 // the signed 32-bit results in dst.
6593 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6595 {
6596 #if defined(__aarch64__)
6597     int32x2_t a = vreinterpret_s32_m64(_a);
6598     int32x2_t b = vreinterpret_s32_m64(_b);
6599     return vreinterpret_m64_s32(vsub_s32(vtrn1_s32(a, b), vtrn2_s32(a, b)));
6600 #else
6601     int32x2x2_t trn_ab =
6602         vtrn_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
6603     return vreinterpret_m64_s32(vsub_s32(trn_ab.val[0], trn_ab.val[1]));
6604 #endif
6605 }
6606 
// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
// using saturation, and pack the signed 16-bit results in dst.
6609 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6611 {
6612 #if defined(__aarch64__)
6613     int16x8_t a = vreinterpretq_s16_m128i(_a);
6614     int16x8_t b = vreinterpretq_s16_m128i(_b);
    return vreinterpretq_m128i_s16(
6616         vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6617 #else
6618     int32x4_t a = vreinterpretq_s32_m128i(_a);
6619     int32x4_t b = vreinterpretq_s32_m128i(_b);
6620     // Interleave using vshrn/vmovn
6621     // [a0|a2|a4|a6|b0|b2|b4|b6]
6622     // [a1|a3|a5|a7|b1|b3|b5|b7]
6623     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6624     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6625     // Saturated subtract
6626     return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
6627 #endif
6628 }
6629 
6630 // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6631 // using saturation, and pack the signed 16-bit results in dst.
6632 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6634 {
6635     int16x4_t a = vreinterpret_s16_m64(_a);
6636     int16x4_t b = vreinterpret_s16_m64(_b);
6637 #if defined(__aarch64__)
    return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6639 #else
6640     int16x4x2_t res = vuzp_s16(a, b);
    return vreinterpret_m64_s16(vqsub_s16(res.val[0], res.val[1]));
6642 #endif
6643 }
6644 
6645 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6646 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6647 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
6648 // and pack the saturated results in dst.
6649 //
6650 //   FOR j := 0 to 7
6651 //      i := j*16
6652 //      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
6653 //      a[i+7:i]*b[i+7:i] )
6654 //   ENDFOR
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6656 {
6657 #if defined(__aarch64__)
6658     uint8x16_t a = vreinterpretq_u8_m128i(_a);
6659     int8x16_t b = vreinterpretq_s8_m128i(_b);
6660     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6661                              vmovl_s8(vget_low_s8(b)));
6662     int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6663                              vmovl_s8(vget_high_s8(b)));
6664     return vreinterpretq_m128i_s16(
6665         vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6666 #else
6667     // This would be much simpler if x86 would choose to zero extend OR sign
6668     // extend, not both. This could probably be optimized better.
6669     uint16x8_t a = vreinterpretq_u16_m128i(_a);
6670     int16x8_t b = vreinterpretq_s16_m128i(_b);
6671 
6672     // Zero extend a
6673     int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6674     int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6675 
6676     // Sign extend by shifting left then shifting right.
6677     int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6678     int16x8_t b_odd = vshrq_n_s16(b, 8);
6679 
6680     // multiply
6681     int16x8_t prod1 = vmulq_s16(a_even, b_even);
6682     int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6683 
6684     // saturated add
6685     return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
6686 #endif
6687 }
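// Illustrative sketch (arbitrary values): a is treated as unsigned bytes and
// b as signed bytes, so a byte pattern above 0x7F in a still contributes a
// positive factor to the signed 16-bit sums.
//
//   __m128i a = _mm_set1_epi8((char) 200);   // interpreted as unsigned 200
//   __m128i b = _mm_set1_epi8(-3);           // interpreted as signed -3
//   __m128i r = _mm_maddubs_epi16(a, b);
//   // every 16-bit lane of r = 200 * (-3) + 200 * (-3) = -1200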
6688 
6689 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6690 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6691 // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6692 // pack the saturated results in dst.
6693 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6695 {
6696     uint16x4_t a = vreinterpret_u16_m64(_a);
6697     int16x4_t b = vreinterpret_s16_m64(_b);
6698 
6699     // Zero extend a
6700     int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6701     int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6702 
6703     // Sign extend by shifting left then shifting right.
6704     int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6705     int16x4_t b_odd = vshr_n_s16(b, 8);
6706 
6707     // multiply
6708     int16x4_t prod1 = vmul_s16(a_even, b_even);
6709     int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6710 
6711     // saturated add
6712     return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6713 }
6714 
6715 // Multiply packed signed 16-bit integers in a and b, producing intermediate
6716 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
6717 // the packed 16-bit integers in dst.
6718 //
6719 //   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
6720 //   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
6721 //   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
6722 //   ...
6723 //   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6725 {
6726     // Has issues due to saturation
6727     // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
6728 
6729     // Multiply
6730     int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
6731                                  vget_low_s16(vreinterpretq_s16_m128i(b)));
6732     int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
6733                                  vget_high_s16(vreinterpretq_s16_m128i(b)));
6734 
6735     // Rounding narrowing shift right
6736     // narrow = (int16_t)((mul + 16384) >> 15);
6737     int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6738     int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6739 
6740     // Join together
6741     return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
6742 }
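// Illustrative Q15 fixed-point sketch (values are arbitrary assumptions);
// each lane computes (int16_t)(((int32_t) a * b + 0x4000) >> 15):
//
//   __m128i a = _mm_set1_epi16(0x4000);      // 0.5 in Q15
//   __m128i b = _mm_set1_epi16(0x2000);      // 0.25 in Q15
//   __m128i r = _mm_mulhrs_epi16(a, b);      // every lane = 0x1000, i.e. 0.125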
6743 
6744 // Multiply packed signed 16-bit integers in a and b, producing intermediate
6745 // signed 32-bit integers. Truncate each intermediate integer to the 18 most
6746 // significant bits, round by adding 1, and store bits [16:1] to dst.
6747 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6749 {
6750     int32x4_t mul_extend =
6751         vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6752 
6753     // Rounding narrowing shift right
6754     return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
6755 }
6756 
6757 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
6758 // corresponding 8-bit element of b, and store the results in dst.
6759 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
6761 {
6762     int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
6763     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
6764     uint8x16_t idx_masked =
6765         vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
6766 #if defined(__aarch64__)
6767     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
6768 #elif defined(__GNUC__)
6769     int8x16_t ret;
6770     // %e and %f represent the even and odd D registers
6771     // respectively.
6772     __asm__ __volatile__(
6773         "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
6774         "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
6775         : [ret] "=&w"(ret)
6776         : [tbl] "w"(tbl), [idx] "w"(idx_masked));
6777     return vreinterpretq_m128i_s8(ret);
6778 #else
    // Generic fallback: emulate a 16-byte table lookup with two vtbl2_s8 calls
6780     int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
6781     return vreinterpretq_m128i_s8(
6782         vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
6783                     vtbl2_s8(a_split, vget_high_u8(idx_masked))));
6784 #endif
6785 }
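// Illustrative sketch (assuming the usual _mm_setr_epi8 helper defined earlier
// in this header): each mask byte selects a byte of a, and a set high bit in a
// mask byte yields zero.
//
//   __m128i a    = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
//                                8, 9, 10, 11, 12, 13, 14, 15);
//   __m128i mask = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
//                                7, 6, 5, 4, 3, 2, 1, 0);
//   __m128i r = _mm_shuffle_epi8(a, mask);   // r = bytes of a in reverse order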
6786 
6787 // Negate packed 16-bit integers in a when the corresponding signed
6788 // 16-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
6790 // in b is zero.
6791 //
6792 //   for i in 0..7
6793 //     if b[i] < 0
6794 //       r[i] := -a[i]
6795 //     else if b[i] == 0
6796 //       r[i] := 0
6797 //     else
6798 //       r[i] := a[i]
6799 //     fi
6800 //   done
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6802 {
6803     int16x8_t a = vreinterpretq_s16_m128i(_a);
6804     int16x8_t b = vreinterpretq_s16_m128i(_b);
6805 
6806     // signed shift right: faster than vclt
6807     // (b < 0) ? 0xFFFF : 0
6808     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6809     // (b == 0) ? 0xFFFF : 0
6810 #if defined(__aarch64__)
6811     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6812 #else
6813     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6814 #endif
6815 
    // bitwise select either a or negative 'a' (vnegq_s16(a) equals negative
    // 'a') based on ltMask
6818     int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6819     // res = masked & (~zeroMask)
6820     int16x8_t res = vbicq_s16(masked, zeroMask);
6821     return vreinterpretq_m128i_s16(res);
6822 }
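// Illustrative sketch (arbitrary values):
//
//   __m128i a = _mm_set1_epi16(7);
//   __m128i b = _mm_setr_epi16(-4, 0, 5, -1, 3, 0, -2, 6);
//   __m128i r = _mm_sign_epi16(a, b);
//   // r = {-7, 0, 7, -7, 7, 0, -7, 7}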
6823 
6824 // Negate packed 32-bit integers in a when the corresponding signed
6825 // 32-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
6827 // in b is zero.
6828 //
6829 //   for i in 0..3
6830 //     if b[i] < 0
6831 //       r[i] := -a[i]
6832 //     else if b[i] == 0
6833 //       r[i] := 0
6834 //     else
6835 //       r[i] := a[i]
6836 //     fi
6837 //   done
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6839 {
6840     int32x4_t a = vreinterpretq_s32_m128i(_a);
6841     int32x4_t b = vreinterpretq_s32_m128i(_b);
6842 
6843     // signed shift right: faster than vclt
6844     // (b < 0) ? 0xFFFFFFFF : 0
6845     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6846 
6847     // (b == 0) ? 0xFFFFFFFF : 0
6848 #if defined(__aarch64__)
6849     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6850 #else
6851     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6852 #endif
6853 
    // bitwise select either a or negative 'a' (vnegq_s32(a) equals negative
    // 'a') based on ltMask
6856     int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6857     // res = masked & (~zeroMask)
6858     int32x4_t res = vbicq_s32(masked, zeroMask);
6859     return vreinterpretq_m128i_s32(res);
6860 }
6861 
6862 // Negate packed 8-bit integers in a when the corresponding signed
6863 // 8-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
6865 // in b is zero.
6866 //
6867 //   for i in 0..15
6868 //     if b[i] < 0
6869 //       r[i] := -a[i]
6870 //     else if b[i] == 0
6871 //       r[i] := 0
6872 //     else
6873 //       r[i] := a[i]
6874 //     fi
6875 //   done
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
6877 {
6878     int8x16_t a = vreinterpretq_s8_m128i(_a);
6879     int8x16_t b = vreinterpretq_s8_m128i(_b);
6880 
6881     // signed shift right: faster than vclt
6882     // (b < 0) ? 0xFF : 0
6883     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6884 
6885     // (b == 0) ? 0xFF : 0
6886 #if defined(__aarch64__)
6887     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6888 #else
6889     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6890 #endif
6891 
    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
    // 'a') based on ltMask
6893     // based on ltMask
6894     int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6895     // res = masked & (~zeroMask)
6896     int8x16_t res = vbicq_s8(masked, zeroMask);
6897 
6898     return vreinterpretq_m128i_s8(res);
6899 }
6900 
6901 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative, and store the results in dst. Elements in dst are
6903 // zeroed out when the corresponding element in b is zero.
6904 //
6905 //   FOR j := 0 to 3
6906 //      i := j*16
6907 //      IF b[i+15:i] < 0
6908 //        dst[i+15:i] := -(a[i+15:i])
6909 //      ELSE IF b[i+15:i] == 0
6910 //        dst[i+15:i] := 0
6911 //      ELSE
6912 //        dst[i+15:i] := a[i+15:i]
6913 //      FI
6914 //   ENDFOR
6915 //
6916 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
6918 {
6919     int16x4_t a = vreinterpret_s16_m64(_a);
6920     int16x4_t b = vreinterpret_s16_m64(_b);
6921 
6922     // signed shift right: faster than vclt
6923     // (b < 0) ? 0xFFFF : 0
6924     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6925 
6926     // (b == 0) ? 0xFFFF : 0
6927 #if defined(__aarch64__)
6928     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6929 #else
6930     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6931 #endif
6932 
    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
    // 'a') based on ltMask
6934     // based on ltMask
6935     int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6936     // res = masked & (~zeroMask)
6937     int16x4_t res = vbic_s16(masked, zeroMask);
6938 
6939     return vreinterpret_m64_s16(res);
6940 }
6941 
6942 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative, and store the results in dst. Elements in dst are
6944 // zeroed out when the corresponding element in b is zero.
6945 //
6946 //   FOR j := 0 to 1
6947 //      i := j*32
6948 //      IF b[i+31:i] < 0
6949 //        dst[i+31:i] := -(a[i+31:i])
6950 //      ELSE IF b[i+31:i] == 0
6951 //        dst[i+31:i] := 0
6952 //      ELSE
6953 //        dst[i+31:i] := a[i+31:i]
6954 //      FI
6955 //   ENDFOR
6956 //
6957 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
6959 {
6960     int32x2_t a = vreinterpret_s32_m64(_a);
6961     int32x2_t b = vreinterpret_s32_m64(_b);
6962 
6963     // signed shift right: faster than vclt
6964     // (b < 0) ? 0xFFFFFFFF : 0
6965     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6966 
6967     // (b == 0) ? 0xFFFFFFFF : 0
6968 #if defined(__aarch64__)
6969     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6970 #else
6971     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6972 #endif
6973 
    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
    // 'a') based on ltMask
6975     // based on ltMask
6976     int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6977     // res = masked & (~zeroMask)
6978     int32x2_t res = vbic_s32(masked, zeroMask);
6979 
6980     return vreinterpret_m64_s32(res);
6981 }
6982 
6983 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
// in b is negative, and store the results in dst. Elements in dst are zeroed
// out
6985 // when the corresponding element in b is zero.
6986 //
6987 //   FOR j := 0 to 7
6988 //      i := j*8
6989 //      IF b[i+7:i] < 0
6990 //        dst[i+7:i] := -(a[i+7:i])
6991 //      ELSE IF b[i+7:i] == 0
6992 //        dst[i+7:i] := 0
6993 //      ELSE
6994 //        dst[i+7:i] := a[i+7:i]
6995 //      FI
6996 //   ENDFOR
6997 //
6998 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
7000 {
7001     int8x8_t a = vreinterpret_s8_m64(_a);
7002     int8x8_t b = vreinterpret_s8_m64(_b);
7003 
7004     // signed shift right: faster than vclt
7005     // (b < 0) ? 0xFF : 0
7006     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7007 
7008     // (b == 0) ? 0xFF : 0
7009 #if defined(__aarch64__)
7010     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7011 #else
7012     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7013 #endif
7014 
    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
    // 'a') based on ltMask
7016     // based on ltMask
7017     int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7018     // res = masked & (~zeroMask)
7019     int8x8_t res = vbic_s8(masked, zeroMask);
7020 
7021     return vreinterpret_m64_s8(res);
7022 }
7023 
7024 /* SSE4.1 */
7025 
7026 // Blend packed 16-bit integers from a and b using control mask imm8, and store
7027 // the results in dst.
7028 //
7029 //   FOR j := 0 to 7
7030 //       i := j*16
7031 //       IF imm8[j]
7032 //           dst[i+15:i] := b[i+15:i]
7033 //       ELSE
7034 //           dst[i+15:i] := a[i+15:i]
7035 //       FI
7036 //   ENDFOR
7037 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
7038 //                                      __constrange(0,255) int imm)
7039 #define _mm_blend_epi16(a, b, imm)                                            \
7040     __extension__({                                                           \
7041         const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,  \
7042                                    ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,  \
7043                                    ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,  \
7044                                    ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,  \
7045                                    ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,  \
7046                                    ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,  \
7047                                    ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,  \
7048                                    ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
7049         uint16x8_t _mask_vec = vld1q_u16(_mask);                              \
7050         uint16x8_t _a = vreinterpretq_u16_m128i(a);                           \
7051         uint16x8_t _b = vreinterpretq_u16_m128i(b);                           \
7052         vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));                \
7053     })
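// Illustrative sketch (arbitrary values): bit j of imm8 selects 16-bit lane j
// from b, otherwise the lane comes from a.
//
//   __m128i a = _mm_set1_epi16(1);
//   __m128i b = _mm_set1_epi16(2);
//   __m128i r = _mm_blend_epi16(a, b, 0x0F);
//   // lanes 0..3 of r come from b (= 2), lanes 4..7 come from a (= 1)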
7054 
7055 // Blend packed double-precision (64-bit) floating-point elements from a and b
7056 // using control mask imm8, and store the results in dst.
7057 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
7058 #define _mm_blend_pd(a, b, imm)                                \
7059     __extension__({                                            \
7060         const uint64_t _mask[2] = {                            \
7061             ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),   \
7062             ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)};  \
7063         uint64x2_t _mask_vec = vld1q_u64(_mask);               \
7064         uint64x2_t _a = vreinterpretq_u64_m128d(a);            \
7065         uint64x2_t _b = vreinterpretq_u64_m128d(b);            \
7066         vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
7067     })
7068 
7069 // Blend packed single-precision (32-bit) floating-point elements from a and b
7070 // using mask, and store the results in dst.
7071 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
7073 {
7074     const uint32_t ALIGN_STRUCT(16)
7075         data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7076                    ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7077                    ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7078                    ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7079     uint32x4_t mask = vld1q_u32(data);
7080     float32x4_t a = vreinterpretq_f32_m128(_a);
7081     float32x4_t b = vreinterpretq_f32_m128(_b);
7082     return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7083 }
7084 
7085 // Blend packed 8-bit integers from a and b using mask, and store the results in
7086 // dst.
7087 //
7088 //   FOR j := 0 to 15
7089 //       i := j*8
7090 //       IF mask[i+7]
7091 //           dst[i+7:i] := b[i+7:i]
7092 //       ELSE
7093 //           dst[i+7:i] := a[i+7:i]
7094 //       FI
7095 //   ENDFOR
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
7097 {
7098     // Use a signed shift right to create a mask with the sign bit
7099     uint8x16_t mask =
7100         vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
7101     uint8x16_t a = vreinterpretq_u8_m128i(_a);
7102     uint8x16_t b = vreinterpretq_u8_m128i(_b);
7103     return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
7104 }
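// Illustrative sketch (arbitrary values): only the sign bit of each mask byte
// matters -- set picks b, clear picks a.
//
//   __m128i a    = _mm_set1_epi8(0x11);
//   __m128i b    = _mm_set1_epi8(0x22);
//   __m128i mask = _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0,
//                                -1, 0, -1, 0, -1, 0, -1, 0);
//   __m128i r = _mm_blendv_epi8(a, b, mask); // alternating 0x22 / 0x11 bytes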
7105 
7106 // Blend packed double-precision (64-bit) floating-point elements from a and b
7107 // using mask, and store the results in dst.
7108 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
7110 {
7111     uint64x2_t mask =
7112         vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
7113 #if defined(__aarch64__)
7114     float64x2_t a = vreinterpretq_f64_m128d(_a);
7115     float64x2_t b = vreinterpretq_f64_m128d(_b);
7116     return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7117 #else
7118     uint64x2_t a = vreinterpretq_u64_m128d(_a);
7119     uint64x2_t b = vreinterpretq_u64_m128d(_b);
7120     return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
7121 #endif
7122 }
7123 
7124 // Blend packed single-precision (32-bit) floating-point elements from a and b
7125 // using mask, and store the results in dst.
7126 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
7128 {
7129     // Use a signed shift right to create a mask with the sign bit
7130     uint32x4_t mask =
7131         vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
7132     float32x4_t a = vreinterpretq_f32_m128(_a);
7133     float32x4_t b = vreinterpretq_f32_m128(_b);
7134     return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7135 }
7136 
7137 // Round the packed double-precision (64-bit) floating-point elements in a up
7138 // to an integer value, and store the results as packed double-precision
7139 // floating-point elements in dst.
7140 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
7142 {
7143 #if defined(__aarch64__)
7144     return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7145 #else
7146     double *f = (double *) &a;
7147     return _mm_set_pd(ceil(f[1]), ceil(f[0]));
7148 #endif
7149 }
7150 
7151 // Round the packed single-precision (32-bit) floating-point elements in a up to
7152 // an integer value, and store the results as packed single-precision
7153 // floating-point elements in dst.
7154 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
7156 {
7157 #if defined(__aarch64__)
7158     return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
7159 #else
7160     float *f = (float *) &a;
7161     return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7162 #endif
7163 }
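// Illustrative sketch (arbitrary values):
//
//   __m128 a = _mm_set_ps(2.5f, -1.5f, 1.1f, 0.2f); // a = {0.2, 1.1, -1.5, 2.5}
//   __m128 r = _mm_ceil_ps(a);                      // r = {1.0, 2.0, -1.0, 3.0}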
7164 
7165 // Round the lower double-precision (64-bit) floating-point element in b up to
7166 // an integer value, store the result as a double-precision floating-point
7167 // element in the lower element of dst, and copy the upper element from a to the
7168 // upper element of dst.
7169 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
7171 {
7172     return _mm_move_sd(a, _mm_ceil_pd(b));
7173 }
7174 
7175 // Round the lower single-precision (32-bit) floating-point element in b up to
7176 // an integer value, store the result as a single-precision floating-point
7177 // element in the lower element of dst, and copy the upper 3 packed elements
7178 // from a to the upper elements of dst.
7179 //
7180 //   dst[31:0] := CEIL(b[31:0])
7181 //   dst[127:32] := a[127:32]
7182 //
7183 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
7185 {
7186     return _mm_move_ss(a, _mm_ceil_ps(b));
7187 }
7188 
7189 // Compare packed 64-bit integers in a and b for equality, and store the results
7190 // in dst
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
7192 {
7193 #if defined(__aarch64__)
7194     return vreinterpretq_m128i_u64(
7195         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
7196 #else
7197     // ARMv7 lacks vceqq_u64
7198     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
7199     uint32x4_t cmp =
7200         vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
7201     uint32x4_t swapped = vrev64q_u32(cmp);
7202     return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
7203 #endif
7204 }
7205 
7206 // Converts the four signed 16-bit integers in the lower 64 bits to four signed
7207 // 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
7209 {
7210     return vreinterpretq_m128i_s32(
7211         vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
7212 }
7213 
// Converts the two signed 16-bit integers in the lower 32 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
7217 {
7218     int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
7219     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7220     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7221     return vreinterpretq_m128i_s64(s64x2);
7222 }
7223 
7224 // Converts the two signed 32-bit integers in the lower 64 bits to two signed
7225 // 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
7227 {
7228     return vreinterpretq_m128i_s64(
7229         vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
7230 }
7231 
// Converts the eight signed 8-bit integers in the lower 64 bits to eight
// signed 16-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
7235 {
7236     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
7237     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7238     return vreinterpretq_m128i_s16(s16x8);
7239 }
7240 
// Converts the four signed 8-bit integers in the lower 32 bits to four
// signed 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
7244 {
7245     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
7246     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
7247     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
7248     return vreinterpretq_m128i_s32(s32x4);
7249 }
7250 
// Converts the two signed 8-bit integers in the lower 16 bits to two
// signed 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
7254 {
7255     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
7256     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
7257     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7258     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7259     return vreinterpretq_m128i_s64(s64x2);
7260 }
7261 
7262 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
7263 // unsigned 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
7265 {
7266     return vreinterpretq_m128i_u32(
7267         vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7268 }
7269 
7270 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
7271 // unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
7273 {
7274     uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
7275     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7276     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7277     return vreinterpretq_m128i_u64(u64x2);
7278 }
7279 
7280 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
7281 // unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
7283 {
7284     return vreinterpretq_m128i_u64(
7285         vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7286 }
7287 
7288 // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
7289 // and store the results in dst.
7290 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
7292 {
7293     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx HGFE DCBA */
7294     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
7295     return vreinterpretq_m128i_u16(u16x8);
7296 }
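// Illustrative sketch (arbitrary values) showing the difference from the
// sign-extending variant:
//
//   __m128i a = _mm_set1_epi8((char) 0xFF);
//   __m128i r = _mm_cvtepu8_epi16(a);  // every 16-bit lane = 255 (zero extend)
//   // _mm_cvtepi8_epi16(a) would instead give -1 in every lane (sign extend)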
7297 
7298 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
7299 // unsigned 32-bit integers.
7300 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
7302 {
7303     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
7304     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
7305     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
7306     return vreinterpretq_m128i_u32(u32x4);
7307 }
7308 
7309 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
7310 // unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
7312 {
7313     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
7314     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
7315     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7316     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7317     return vreinterpretq_m128i_u64(u64x2);
7318 }
7319 
7320 // Conditionally multiply the packed single-precision (32-bit) floating-point
7321 // elements in a and b using the high 4 bits in imm8, sum the four products,
7322 // and conditionally store the sum in dst using the low 4 bits of imm.
7323 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7325 {
7326 #if defined(__aarch64__)
7327     /* shortcuts */
7328     if (imm == 0xFF) {
7329         return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
7330     }
7331     if (imm == 0x7F) {
7332         float32x4_t m = _mm_mul_ps(a, b);
7333         m[3] = 0;
7334         return _mm_set1_ps(vaddvq_f32(m));
7335     }
7336 #endif
7337 
7338     float s = 0, c = 0;
7339     float32x4_t f32a = vreinterpretq_f32_m128(a);
7340     float32x4_t f32b = vreinterpretq_f32_m128(b);
7341 
7342     /* To improve the accuracy of floating-point summation, Kahan algorithm
7343      * is used for each operation.
7344      */
7345     if (imm & (1 << 4))
7346         _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
7347     if (imm & (1 << 5))
7348         _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
7349     if (imm & (1 << 6))
7350         _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
7351     if (imm & (1 << 7))
7352         _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
7353     s += c;
7354 
7355     float32x4_t res = {
7356         (imm & 0x1) ? s : 0,
7357         (imm & 0x2) ? s : 0,
7358         (imm & 0x4) ? s : 0,
7359         (imm & 0x8) ? s : 0,
7360     };
7361     return vreinterpretq_m128_f32(res);
7362 }
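// Illustrative sketch (arbitrary values): with imm == 0xFF all four products
// are summed and the sum is broadcast to every lane of dst.
//
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // a = {1, 2, 3, 4}
//   __m128 b = _mm_set1_ps(2.0f);
//   __m128 r = _mm_dp_ps(a, b, 0xFF);              // r = {20, 20, 20, 20}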
7363 
7364 // Extracts the selected signed or unsigned 32-bit integer from a and zero
7365 // extends.
7366 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7367 #define _mm_extract_epi32(a, imm) \
7368     vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7369 
7370 // Extracts the selected signed or unsigned 64-bit integer from a and zero
7371 // extends.
7372 // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7373 #define _mm_extract_epi64(a, imm) \
7374     vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7375 
7376 // Extracts the selected signed or unsigned 8-bit integer from a and zero
7377 // extends.
7378 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
7379 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
7380 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7381 
// Extracts the selected single-precision (32-bit) floating-point element
// from a.
7383 // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7384 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7385 
7386 // Round the packed double-precision (64-bit) floating-point elements in a down
7387 // to an integer value, and store the results as packed double-precision
7388 // floating-point elements in dst.
7389 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7391 {
7392 #if defined(__aarch64__)
7393     return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7394 #else
7395     double *f = (double *) &a;
7396     return _mm_set_pd(floor(f[1]), floor(f[0]));
7397 #endif
7398 }
7399 
7400 // Round the packed single-precision (32-bit) floating-point elements in a down
7401 // to an integer value, and store the results as packed single-precision
7402 // floating-point elements in dst.
7403 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7405 {
7406 #if defined(__aarch64__)
7407     return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7408 #else
7409     float *f = (float *) &a;
7410     return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7411 #endif
7412 }
7413 
7414 // Round the lower double-precision (64-bit) floating-point element in b down to
7415 // an integer value, store the result as a double-precision floating-point
7416 // element in the lower element of dst, and copy the upper element from a to the
7417 // upper element of dst.
7418 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7420 {
7421     return _mm_move_sd(a, _mm_floor_pd(b));
7422 }
7423 
7424 // Round the lower single-precision (32-bit) floating-point element in b down to
7425 // an integer value, store the result as a single-precision floating-point
7426 // element in the lower element of dst, and copy the upper 3 packed elements
7427 // from a to the upper elements of dst.
7428 //
7429 //   dst[31:0] := FLOOR(b[31:0])
7430 //   dst[127:32] := a[127:32]
7431 //
7432 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7434 {
7435     return _mm_move_ss(a, _mm_floor_ps(b));
7436 }
7437 
7438 // Inserts the least significant 32 bits of b into the selected 32-bit integer
7439 // of a.
7440 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7441 //                                       __constrange(0,4) int imm)
7442 #define _mm_insert_epi32(a, b, imm)                                  \
7443     __extension__({                                                  \
7444         vreinterpretq_m128i_s32(                                     \
7445             vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
7446     })
7447 
7448 // Inserts the least significant 64 bits of b into the selected 64-bit integer
7449 // of a.
7450 // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7451 //                                       __constrange(0,2) int imm)
7452 #define _mm_insert_epi64(a, b, imm)                                  \
7453     __extension__({                                                  \
7454         vreinterpretq_m128i_s64(                                     \
7455             vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
7456     })
7457 
7458 // Inserts the least significant 8 bits of b into the selected 8-bit integer
7459 // of a.
7460 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7461 //                                      __constrange(0,16) int imm)
7462 #define _mm_insert_epi8(a, b, imm)                                 \
7463     __extension__({                                                \
7464         vreinterpretq_m128i_s8(                                    \
7465             vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
7466     })
7467 
7468 // Copy a to tmp, then insert a single-precision (32-bit) floating-point
7469 // element from b into tmp using the control in imm8. Store tmp to dst using
7470 // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7471 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
7472 #define _mm_insert_ps(a, b, imm8)                                              \
7473     __extension__({                                                            \
        float32x4_t tmp1 = vsetq_lane_f32(vgetq_lane_f32(b, (imm8 >> 6) & 0x3), \
7475                                           vreinterpretq_f32_m128(a), 0);       \
7476         float32x4_t tmp2 =                                                     \
7477             vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
                           ((imm8 >> 4) & 0x3));                               \
7479         const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,        \
7480                                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,        \
7481                                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,        \
7482                                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};       \
7483         uint32x4_t mask = vld1q_u32(data);                                     \
7484         float32x4_t all_zeros = vdupq_n_f32(0);                                \
7485                                                                                \
7486         vreinterpretq_m128_f32(                                                \
7487             vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2)));         \
7488     })
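
// Usage sketch (illustrative, not part of the API): imm8 packs the source
// lane of b in bits [7:6], the destination lane in bits [5:4], and a zero
// mask in bits [3:0]. The values below are an assumed example.
//
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
//   __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);
//   // Copy lane 0 of b into lane 2 of a and zero lane 0 of the result:
//   __m128 r = _mm_insert_ps(a, b, 0x21);  // (0 << 6) | (2 << 4) | 0x1
//   // r = {0.0f, 2.0f, 5.0f, 4.0f} in lane order 0..3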

// epi versions of min/max
// Computes the pairwise maximums of the four signed 32-bit integer values of a
// and b.
//
// A 128-bit parameter that can be defined with the following equations:
//   r0 := (a0 > b0) ? a0 : b0
//   r1 := (a1 > b1) ? a1 : b1
//   r2 := (a2 > b2) ? a2 : b2
//   r3 := (a3 > b3) ? a3 : b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed signed 8-bit integers in a and b, and store packed maximum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
}

// Computes the pairwise minima of the four signed 32-bit integer values of a
// and b.
//
// A 128-bit parameter that can be defined with the following equations:
//   r0 := (a0 < b0) ? a0 : b0
//   r1 := (a1 < b1) ? a1 : b1
//   r2 := (a2 < b2) ? a2 : b2
//   r3 := (a3 < b3) ? a3 : b3
//
// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed signed 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
// values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
}

// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
// in a, store the minimum and index in dst, and zero the remaining bits in dst.
//
//   index[2:0] := 0
//   min[15:0] := a[15:0]
//   FOR j := 0 to 7
//       i := j*16
//       IF a[i+15:i] < min[15:0]
//           index[2:0] := j
//           min[15:0] := a[i+15:i]
//       FI
//   ENDFOR
//   dst[15:0] := min[15:0]
//   dst[18:16] := index[2:0]
//   dst[127:19] := 0
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
{
    __m128i dst;
    uint16_t min, idx = 0;
    // Find the minimum value
#if defined(__aarch64__)
    min = vminvq_u16(vreinterpretq_u16_m128i(a));
#else
    __m64 tmp;
    tmp = vreinterpret_m64_u16(
        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
                 vget_high_u16(vreinterpretq_u16_m128i(a))));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
#endif
    // Get the index of the minimum value
    int i;
    for (i = 0; i < 8; i++) {
        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
            idx = (uint16_t) i;
            break;
        }
        a = _mm_srli_si128(a, 2);
    }
    // Generate result
    dst = _mm_setzero_si128();
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
    return dst;
}
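
// Usage sketch (illustrative only): the horizontal minimum ends up in the low
// 16 bits of the result and the index of its first occurrence in bits [18:16].
//
//   __m128i v = _mm_setr_epi16(9, 4, 7, 4, 12, 30, 8, 15);
//   __m128i r = _mm_minpos_epu16(v);
//   int min = _mm_extract_epi16(r, 0);  // 4
//   int idx = _mm_extract_epi16(r, 1);  // 1 (lane of the first 4)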

// Multiply the low signed 32-bit integers from each packed 64-bit element in
// a and b, and store the signed 64-bit results in dst.
//
//   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
//   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
{
    // vmull_s32 upcasts instead of masking, so we downcast.
    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
}

// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
// unsigned 32-bit integers from b, and stores the low 32 bits of each product.
// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}
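
// Usage sketch (illustrative only): _mm_mul_epi32 widens lanes 0 and 2 into
// two full 64-bit products, while _mm_mullo_epi32 keeps only the low 32 bits
// of all four products.
//
//   __m128i a = _mm_setr_epi32(100000, 7, 3, 9);
//   __m128i b = _mm_setr_epi32(100000, 5, 3, 9);
//   __m128i wide = _mm_mul_epi32(a, b);   // {10000000000, 9} as int64
//   __m128i low  = _mm_mullo_epi32(a, b); // {1410065408, 35, 9, 81} as int32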

// Packs the 8 signed 32-bit integers from a and b into unsigned 16-bit
// integers using unsigned saturation.
//
//   r0 := UnsignedSaturate(a0)
//   r1 := UnsignedSaturate(a1)
//   r2 := UnsignedSaturate(a2)
//   r3 := UnsignedSaturate(a3)
//   r4 := UnsignedSaturate(b0)
//   r5 := UnsignedSaturate(b1)
//   r6 := UnsignedSaturate(b2)
//   r7 := UnsignedSaturate(b3)
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
}
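
// Usage sketch (illustrative only): lanes outside [0, 65535] saturate, so
// negative inputs clamp to 0 and oversized inputs clamp to 0xFFFF.
//
//   __m128i a = _mm_setr_epi32(-1, 0, 70000, 65535);
//   __m128i b = _mm_setr_epi32(1, 2, 3, 4);
//   __m128i r = _mm_packus_epi32(a, b);
//   // r = {0, 0, 0xFFFF, 0xFFFF, 1, 2, 3, 4} as unsigned 16-bit lanes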

// Round the packed double-precision (64-bit) floating-point elements in a using
// the rounding parameter, and store the results as packed double-precision
// floating-point elements in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
{
#if defined(__aarch64__)
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_pd(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_pd(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
    default:  //_MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
    }
#else
    double *v_double = (double *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        double res[2], tmp;
        for (int i = 0; i < 2; i++) {
            tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
            double roundDown = floor(tmp);  // Round down value
            double roundUp = ceil(tmp);     // Round up value
            double diffDown = tmp - roundDown;
            double diffUp = roundUp - tmp;
            if (diffDown < diffUp) {
                /* If it's closer to the round down value, then use it */
                res[i] = roundDown;
            } else if (diffDown > diffUp) {
                /* If it's closer to the round up value, then use it */
                res[i] = roundUp;
            } else {
                /* If it's equidistant between round up and round down value,
                 * pick the one which is an even number */
                double half = roundDown / 2;
                if (half != floor(half)) {
                    /* If the round down value is odd, return the round up value
                     */
                    res[i] = roundUp;
                } else {
                    /* If the round up value is odd, return the round down value
                     */
                    res[i] = roundDown;
                }
            }
            res[i] = (v_double[i] < 0) ? -res[i] : res[i];
        }
        return _mm_set_pd(res[1], res[0]);
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_pd(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_pd(a);
    }
    return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
                      v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
#endif
}

// Round the packed single-precision (32-bit) floating-point elements in a using
// the rounding parameter, and store the results as packed single-precision
// floating-point elements in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if defined(__aarch64__)
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_ps(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_ps(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
    default:  //_MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
    }
#else
    float *v_float = (float *) &a;

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        uint32x4_t signmask = vdupq_n_u32(0x80000000);
        float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
                                     vdupq_n_f32(0.5f)); /* +/- 0.5 */
        int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
        int32x4_t r_trunc = vcvtq_s32_f32(
            vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
        float32x4_t delta = vsubq_f32(
            vreinterpretq_f32_m128(a),
            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
        uint32x4_t is_delta_half =
            vceqq_f32(delta, half); /* delta == +/- 0.5 */
        return vreinterpretq_m128_f32(
            vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_ps(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_ps(a);
    }
    return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
                      v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
                      v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
                      v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
#endif
}
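
// Usage sketch (illustrative only): with round-to-nearest, ties go to the
// even integer, matching the SSE4.1 behaviour emulated above.
//
//   __m128 a = _mm_set_ps(-1.5f, 2.5f, 0.5f, 1.3f);
//   __m128 r = _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   // r = {1.0f, 0.0f, 2.0f, -2.0f} in lane order 0..3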

// Round the lower double-precision (64-bit) floating-point element in b using
// the rounding parameter, store the result as a double-precision floating-point
// element in the lower element of dst, and copy the upper element from a to the
// upper element of dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
{
    return _mm_move_sd(a, _mm_round_pd(b, rounding));
}

// Round the lower single-precision (32-bit) floating-point element in b using
// the rounding parameter, store the result as a single-precision floating-point
// element in the lower element of dst, and copy the upper 3 packed elements
// from a to the upper elements of dst. Rounding is done according to the
// rounding[3:0] parameter, which can be one of:
//     (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, suppress exceptions
//     (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)     // round down, suppress exceptions
//     (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)     // round up, suppress exceptions
//     (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)        // truncate, suppress exceptions
//     _MM_FROUND_CUR_DIRECTION                        // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
{
    return _mm_move_ss(a, _mm_round_ps(b, rounding));
}

// Load 128-bits of integer data from memory into dst using a non-temporal
// memory hint. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
//
//   dst[127:0] := MEM[mem_addr+127:mem_addr]
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
{
#if __has_builtin(__builtin_nontemporal_load)
    return __builtin_nontemporal_load(p);
#else
    return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
#endif
}

// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
// all 1's, and return 1 if the result is zero, otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
FORCE_INLINE int _mm_test_all_ones(__m128i a)
{
    return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
           ~(uint64_t) 0;
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and return 1 if the result is zero, otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
{
    int64x2_t a_and_mask =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
    return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and
// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
{
    uint64x2_t zf =
        vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
    uint64x2_t cf =
        vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
    uint64x2_t result = vandq_u64(zf, cf);
    return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the CF value.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
{
    int64x2_t s64 =
        vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
                  vreinterpretq_s64_m128i(b));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
// otherwise return 0.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
// otherwise set CF to 0. Return the ZF value.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
{
    int64x2_t s64 =
        vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
    return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
}
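
// Usage sketch (illustrative only): the three test intrinsics report ZF, CF,
// and "neither" for the same AND / AND-NOT pair of operands.
//
//   __m128i a = _mm_set1_epi32(0x0F0F0F0F);
//   __m128i b = _mm_set1_epi32((int) 0xF0F0F0F0);
//   int zf = _mm_testz_si128(a, b);    // 1: (a & b) == 0
//   int cf = _mm_testc_si128(a, b);    // 0: (~a & b) != 0
//   int nz = _mm_testnzc_si128(a, b);  // 0: requires ZF == 0 and CF == 0
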
/* SSE4.2 */

// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
// in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__)
    return vreinterpretq_m128i_u64(
        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    return vreinterpretq_m128i_s64(vshrq_n_s64(
        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
        63));
#endif
}
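
// Note on the ARMv7-A fallback above: vqsubq_s64(b, a) is negative exactly
// when a > b (saturation preserves the sign), so the arithmetic shift right
// by 63 broadcasts the sign bit into an all-ones or all-zeros mask per lane.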

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 16-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u8(crc, v & 0xff);
    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 32-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u16(crc, v & 0xffff);
    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 64-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
    crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 8-bit integer v.
// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#else
    crc ^= v;
    for (int bit = 0; bit < 8; bit++) {
        if (crc & 1)
            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
        else
            crc = (crc >> 1);
    }
#endif
    return crc;
}
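
// Usage sketch (illustrative only): accumulating CRC-32C over a byte buffer
// one byte at a time; buf and len are assumed caller-provided names, and the
// all-ones seed plus final inversion follow the usual CRC-32C convention.
//
//   uint32_t crc = 0xFFFFFFFF;
//   for (size_t i = 0; i < len; i++)
//       crc = _mm_crc32_u8(crc, buf[i]);
//   crc = ~crc;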

/* AES */

#if !defined(__ARM_FEATURE_CRYPTO)
/* clang-format off */
#define SSE2NEON_AES_DATA(w)                                           \
    {                                                                  \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
    }
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information. Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
                                         0xc, 0x1, 0x6, 0xb};
    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);

    // mix columns
    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
    (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
     (b0))
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
        SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
    uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
    uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
    uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));

    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    /* FIXME: optimize for NEON */
    uint8_t v[4][4] = {
        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
        {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
         SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
    };
    for (int i = 0; i < 16; i++)
        vreinterpretq_nth_u8_m128i(a, i) =
            v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
    return a;
}

// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
//
// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
{
    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
}
#undef SSE2NEON_AES_DATA

#else /* __ARM_FEATURE_CRYPTO */
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
// AESMC and then manually applying the real key as an xor operation. This
// unfortunately means an additional xor op; the compiler should be able to
// optimize this away for repeated calls however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
        vreinterpretq_u8_m128i(b));
}

// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
                         RoundKey);
}

FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE does ShiftRows and SubBytes on A
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
}
#endif
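
// Usage sketch (illustrative only): one middle round followed by the final
// round of an AES encryption, with hypothetical caller-prepared round keys
// rk1 and rk2 and a 16-byte buffer named block.
//
//   __m128i state = _mm_loadu_si128((const __m128i *) block);
//   state = _mm_aesenc_si128(state, rk1);      // SubBytes/ShiftRows/MixColumns + key
//   state = _mm_aesenclast_si128(state, rk2);  // final round, no MixColumns
//   _mm_storeu_si128((__m128i *) block, state);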

/* Others */

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}
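
// Usage sketch (illustrative only): bit 0 of imm selects the 64-bit half of a
// and bit 4 selects the half of b that feed the carry-less multiply.
//
//   __m128i a = _mm_set_epi64x(0, 0x8C);
//   __m128i b = _mm_set_epi64x(0, 0x03);
//   __m128i r = _mm_clmulepi64_si128(a, b, 0x00);
//   // low 64 bits of r = 0x8C ^ (0x8C << 1) = 0x194 (product over GF(2))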

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    vst1_u32(&count, count32x2_val);
    return count;
#endif
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}

#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

#endif