1 #ifndef SSE2NEON_H
2 #define SSE2NEON_H
3 
// This header file provides a simple API translation layer
// between SSE intrinsics and their corresponding Arm/Aarch64 NEON versions.
6 //
7 // This header file does not yet translate all of the SSE intrinsics.
8 //
9 // Contributors to this work are:
10 //   John W. Ratcliff <jratcliffscarab@gmail.com>
11 //   Brandon Rowlett <browlett@nvidia.com>
12 //   Ken Fast <kfast@gdeb.com>
13 //   Eric van Beurden <evanbeurden@nvidia.com>
14 //   Alexander Potylitsin <apotylitsin@nvidia.com>
15 //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
16 //   Jim Huang <jserv@biilabs.io>
17 //   Mark Cheng <marktwtn@biilabs.io>
18 //   Malcolm James MacLeod <malcolm@gulden.com>
19 //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20 //   Sebastian Pop <spop@amazon.com>
21 //   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22 //   Danila Kutenin <danilak@google.com>
23 //   François Turban (JishinMaster) <francois.turban@gmail.com>
24 //   Pei-Hsuan Hung <afcidk@gmail.com>
25 //   Yang-Hao Yuan <yanghau@biilabs.io>
26 
27 /*
28  * sse2neon is freely redistributable under the MIT License.
29  *
30  * Permission is hereby granted, free of charge, to any person obtaining a copy
31  * of this software and associated documentation files (the "Software"), to deal
32  * in the Software without restriction, including without limitation the rights
33  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34  * copies of the Software, and to permit persons to whom the Software is
35  * furnished to do so, subject to the following conditions:
36  *
37  * The above copyright notice and this permission notice shall be included in
38  * all copies or substantial portions of the Software.
39  *
40  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
46  * SOFTWARE.
47  */
48 
49 /* Tunable configurations */
50 
/* Enable precise implementation of _mm_min_ps and _mm_max_ps
 * This slows down the computation a bit, but gives results consistent with
 * x86 SSE2 (e.g. it avoids a hole or NaN pixel in a rendering result).
 */
55 #ifndef SSE2NEON_PRECISE_MINMAX
56 #define SSE2NEON_PRECISE_MINMAX (0)
57 #endif
58 
59 #if defined(__GNUC__) || defined(__clang__)
60 #pragma push_macro("FORCE_INLINE")
61 #pragma push_macro("ALIGN_STRUCT")
62 #define FORCE_INLINE static inline __attribute__((always_inline))
63 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
64 #else
65 #error "Macro name collisions may happen with unsupported compiler."
66 #ifdef FORCE_INLINE
67 #undef FORCE_INLINE
68 #endif
69 #define FORCE_INLINE static inline
70 #ifndef ALIGN_STRUCT
71 #define ALIGN_STRUCT(x) __declspec(align(x))
72 #endif
73 #endif
74 
75 #include <stdint.h>
76 #include <stdlib.h>
77 
// These cause the build to fail on Raspberry Pi with 'unsupported target'
// and don't seem to do anything particularly useful.
80 ///* Architecture-specific build options */
81 ///* FIXME: #pragma GCC push_options is only available on GCC */
82 //#if defined(__GNUC__)
83 //#if defined(__arm__) && __ARM_ARCH == 7
84 ///* According to ARM C Language Extensions Architecture specification,
85 // * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
86 // * architecture supported.
87 // */
88 //#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
89 //#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
90 //#endif
91 //#pragma GCC push_options
92 //#pragma GCC target("fpu=neon")
93 //#elif defined(__aarch64__)
94 //#pragma GCC push_options
95 //#pragma GCC target("+simd")
96 //#else
97 //#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
98 //#endif
99 //#endif
100 
101 #include <arm_neon.h>
102 
/* Rounding functions require either Aarch64 instructions or a libm fallback */
104 #if !defined(__aarch64__)
105 #include <math.h>
106 #endif
107 
108 /* "__has_builtin" can be used to query support for built-in functions
109  * provided by gcc/clang and other compilers that support it.
110  */
111 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
112 /* Compatibility with gcc <= 9 */
113 #if __GNUC__ <= 9
114 #define __has_builtin(x) HAS##x
115 #define HAS__builtin_popcount 1
116 #define HAS__builtin_popcountll 1
117 #else
118 #define __has_builtin(x) 0
119 #endif
120 #endif
121 
/**
 * MACRO for the shuffle parameter of _mm_shuffle_ps().
 * Argument fp3 is a digit [0123] that represents the float from argument "b"
 * of _mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
 * for fp2 of the result. fp1 is a digit [0123] that represents the float from
 * argument "a" of _mm_shuffle_ps that will be placed in fp1 of the result.
 * fp0 is the same for fp0 of the result.
 */
130 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
131     (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
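
// A minimal usage sketch (illustrative only; it assumes the generic
// _mm_shuffle_ps implementation provided later in this header):
//
//   __m128 a = _mm_setr_ps(10.0f, 11.0f, 12.0f, 13.0f);
//   __m128 b = _mm_setr_ps(20.0f, 21.0f, 22.0f, 23.0f);
//   // _MM_SHUFFLE(3, 2, 1, 0) == 0xE4: the result keeps a[0], a[1] in the
//   // low half and b[2], b[3] in the high half, i.e. {10, 11, 22, 23}.
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));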
132 
133 /* Rounding mode macros. */
134 #define _MM_FROUND_TO_NEAREST_INT 0x00
135 #define _MM_FROUND_TO_NEG_INF 0x01
136 #define _MM_FROUND_TO_POS_INF 0x02
137 #define _MM_FROUND_TO_ZERO 0x03
138 #define _MM_FROUND_CUR_DIRECTION 0x04
139 #define _MM_FROUND_NO_EXC 0x08
140 
141 /* indicate immediate constant argument in a given range */
142 #define __constrange(a, b) const
143 
144 /* A few intrinsics accept traditional data types like ints or floats, but
145  * most operate on data types that are specific to SSE.
146  * If a vector type ends in d, it contains doubles, and if it does not have
147  * a suffix, it contains floats. An integer vector type can contain any type
148  * of integer, from chars to shorts to unsigned long longs.
149  */
150 typedef int64x1_t __m64;
151 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
152 // On ARM 32-bit architecture, the float64x2_t is not supported.
153 // The data type __m128d should be represented in a different way for related
154 // intrinsic conversion.
155 #if defined(__aarch64__)
156 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
157 #else
158 typedef float32x4_t __m128d;
159 #endif
160 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
161 
162 /* type-safe casting between types */
163 
164 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
165 #define vreinterpretq_m128_f32(x) (x)
166 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
167 
168 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
169 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
170 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
171 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
172 
173 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
174 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
175 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
176 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
177 
178 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
179 #define vreinterpretq_f32_m128(x) (x)
180 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
181 
182 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
183 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
184 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
185 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
186 
187 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
188 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
189 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
190 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
191 
192 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
193 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
194 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
195 #define vreinterpretq_m128i_s64(x) (x)
196 
197 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
198 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
199 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
200 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
201 
202 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
203 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
204 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
205 #define vreinterpretq_s64_m128i(x) (x)
206 
207 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
208 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
209 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
210 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
211 
212 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
213 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
214 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
215 #define vreinterpret_m64_s64(x) (x)
216 
217 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
218 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
219 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
220 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
221 
222 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
223 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
224 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
225 
226 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
227 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
228 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
229 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
230 
231 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
232 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
233 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
234 #define vreinterpret_s64_m64(x) (x)
235 
236 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
237 
238 #if defined(__aarch64__)
239 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
240 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
241 
242 #define vreinterpretq_m128d_f64(x) (x)
243 
244 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
245 
246 #define vreinterpretq_f64_m128d(x) (x)
247 #else
248 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
249 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
250 
251 #define vreinterpretq_m128d_f32(x) (x)
252 
253 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
254 
255 #define vreinterpretq_f32_m128d(x) (x)
256 #endif
257 
// A struct named 'SIMDVec' is defined in this header file and can be used
// by applications which attempt to access the contents of an __m128 struct
// directly.  Note that accessing the __m128 struct directly is considered
// bad coding practice by Microsoft; see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
263 //
264 // However, some legacy source code may try to access the contents of an __m128
265 // struct directly so the developer can use the SIMDVec as an alias for it.  Any
266 // casting must be done manually by the developer, as you cannot cast or
267 // otherwise alias the base NEON data type for intrinsic operations.
268 //
// This union is intended to allow direct access to an __m128 variable using the names
270 // that the MSVC compiler provides.  This union should really only be used when
271 // trying to access the members of the vector as integer values.  GCC/clang
272 // allow native access to the float members through a simple array access
273 // operator (in C since 4.6, in C++ since 4.8).
274 //
275 // Ideally direct accesses to SIMD vectors should not be used since it can cause
276 // a performance hit.  If it really is needed however, the original __m128
277 // variable can be aliased with a pointer to this union and used to access
278 // individual components.  The use of this union should be hidden behind a macro
279 // that is used throughout the codebase to access the members instead of always
280 // declaring this type of variable.
281 typedef union ALIGN_STRUCT(16) SIMDVec {
282     float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
283     int8_t m128_i8[16];    // as signed 8-bit integers.
284     int16_t m128_i16[8];   // as signed 16-bit integers.
285     int32_t m128_i32[4];   // as signed 32-bit integers.
286     int64_t m128_i64[2];   // as signed 64-bit integers.
287     uint8_t m128_u8[16];   // as unsigned 8-bit integers.
288     uint16_t m128_u16[8];  // as unsigned 16-bit integers.
289     uint32_t m128_u32[4];  // as unsigned 32-bit integers.
290     uint64_t m128_u64[2];  // as unsigned 64-bit integers.
291 } SIMDVec;
292 
293 // casting using SIMDVec
294 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
295 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
296 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
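
// A minimal sketch of SIMDVec-based element access (illustrative only):
//
//   __m128i v = _mm_set_epi32(3, 2, 1, 0);
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); /* == 0 */
//   uint32_t lane3 = vreinterpretq_nth_u32_m128i(v, 3); /* == 3 */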
297 
/* Backwards compatibility for compilers lacking support for specific types */

// Older GCC versions do not provide the vld1q_u8_x4 intrinsic
301 #if defined(__GNUC__) && !defined(__clang__)
302 #if __GNUC__ <= 9
FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
304 {
305     uint8x16x4_t ret;
306     ret.val[0] = vld1q_u8(p + 0);
307     ret.val[1] = vld1q_u8(p + 16);
308     ret.val[2] = vld1q_u8(p + 32);
309     ret.val[3] = vld1q_u8(p + 48);
310     return ret;
311 }
312 #endif
313 #endif
314 
315 /* Function Naming Conventions
316  * The naming convention of SSE intrinsics is straightforward. A generic SSE
317  * intrinsic function is given as follows:
318  *   _mm_<name>_<data_type>
319  *
320  * The parts of this format are given as follows:
321  * 1. <name> describes the operation performed by the intrinsic
322  * 2. <data_type> identifies the data type of the function's primary arguments
323  *
324  * This last part, <data_type>, is a little complicated. It identifies the
325  * content of the input values, and can be set to any of the following values:
326  * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
328  * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
329  *                            signed integers
330  * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
331  *                            unsigned integers
332  * + si128 - unspecified 128-bit vector or 256-bit vector
333  * + m128/m128i/m128d - identifies input vector types when they are different
334  *                      than the type of the returned vector
335  *
336  * For example, _mm_setzero_ps. The _mm implies that the function returns
337  * a 128-bit vector. The _ps at the end implies that the argument vectors
338  * contain floats.
339  *
340  * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 *   // Set packed 16-bit integers. 128 bits, 8 shorts, 16 bits each
342  *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
343  *   // Set packed 8-bit integers
 *   // 128 bits, 16 chars, 8 bits each
345  *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
346  *                                  4, 5, 12, 13, 6, 7, 14, 15);
347  *   // Shuffle packed 8-bit integers
348  *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
349  *
350  * Data (Number, Binary, Byte Index):
351     +------+------+-------------+------+------+-------------+
352     |      1      |      2      |      3      |      4      | Number
353     +------+------+------+------+------+------+------+------+
354     | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
355     +------+------+------+------+------+------+------+------+
356     |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
357     +------+------+------+------+------+------+------+------+
358 
359     +------+------+------+------+------+------+------+------+
360     |      5      |      6      |      7      |      8      | Number
361     +------+------+------+------+------+------+------+------+
362     | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
363     +------+------+------+------+------+------+------+------+
364     |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
365     +------+------+------+------+------+------+------+------+
366  * Index (Byte Index):
367     +------+------+------+------+------+------+------+------+
368     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
369     +------+------+------+------+------+------+------+------+
370 
371     +------+------+------+------+------+------+------+------+
372     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
373     +------+------+------+------+------+------+------+------+
374  * Result:
375     +------+------+------+------+------+------+------+------+
376     |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
377     +------+------+------+------+------+------+------+------+
378     | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
379     +------+------+------+------+------+------+------+------+
380     |     256     |      2      |      5      |      6      | Number
381     +------+------+------+------+------+------+------+------+
382 
383     +------+------+------+------+------+------+------+------+
384     |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
385     +------+------+------+------+------+------+------+------+
386     | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
387     +------+------+------+------+------+------+------+------+
388     |      3      |      7      |      4      |      8      | Number
389     +------+------+------+------+------+------+-------------+
390  */
391 
392 /* Set/get methods */
393 
394 /* Constants for use with _mm_prefetch.  */
395 enum _mm_hint {
396     _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
397     _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
398     _MM_HINT_T1 = 2,   /* load data to L2 cache only */
399     _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
400     _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
401     _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
402     _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
403     _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
404 };
405 
406 // Loads one cache line of data from address p to a location closer to the
407 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
FORCE_INLINE void _mm_prefetch(const void *p, int i)
409 {
410     (void) i;
411     __builtin_prefetch(p);
412 }
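
// Usage sketch (illustrative only): this implementation ignores the hint
// argument and always issues a plain __builtin_prefetch.
//
//   const char *buf = ...;     /* some buffer being streamed through */
//   _mm_prefetch(buf + 64, _MM_HINT_T0);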
413 
414 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
415 //
416 //   dst[31:0] := a[31:0]
417 //
418 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
420 {
421     return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
422 }
423 
424 // Convert the lower single-precision (32-bit) floating-point element in a to a
425 // 32-bit integer, and store the result in dst.
426 //
427 //   dst[31:0] := Convert_FP32_To_Int32(a[31:0])
428 //
429 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
430 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
431 
432 // Convert the lower single-precision (32-bit) floating-point element in a to a
433 // 64-bit integer, and store the result in dst.
434 //
435 //   dst[63:0] := Convert_FP32_To_Int64(a[31:0])
436 //
437 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
439 {
440 #if defined(__aarch64__)
441     return vgetq_lane_s64(
442         vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0);
443 #else
444     float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
445     float32_t diff = data - floor(data);
446     if (diff > 0.5)
447         return (int64_t) ceil(data);
448     if (diff == 0.5) {
449         int64_t f = (int64_t) floor(data);
450         int64_t c = (int64_t) ceil(data);
451         return c & 1 ? f : c;
452     }
453     return (int64_t) floor(data);
454 #endif
455 }
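
// Behaviour sketch (round-to-nearest, ties-to-even; illustrative only):
//
//   _mm_cvtss_si64(_mm_set_ss(2.5f)); /* -> 2 */
//   _mm_cvtss_si64(_mm_set_ss(3.5f)); /* -> 4 */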
456 
457 // Convert packed single-precision (32-bit) floating-point elements in a to
458 // packed 32-bit integers with truncation, and store the results in dst.
459 //
460 //   FOR j := 0 to 1
461 //      i := 32*j
462 //      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
463 //   ENDFOR
464 //
465 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
467 {
468     return vreinterpret_m64_s32(
469         vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
470 }
471 
472 // Convert the lower single-precision (32-bit) floating-point element in a to a
473 // 32-bit integer with truncation, and store the result in dst.
474 //
475 //   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
476 //
477 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
479 {
480     return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
481 }
482 
483 // Convert packed single-precision (32-bit) floating-point elements in a to
484 // packed 32-bit integers with truncation, and store the results in dst.
485 //
486 //   FOR j := 0 to 1
487 //      i := 32*j
488 //      dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
489 //   ENDFOR
490 //
491 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
492 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
493 
494 // Convert the lower single-precision (32-bit) floating-point element in a to a
495 // 32-bit integer with truncation, and store the result in dst.
496 //
497 //   dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
498 //
499 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
500 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
501 
502 // Convert the lower single-precision (32-bit) floating-point element in a to a
503 // 64-bit integer with truncation, and store the result in dst.
504 //
505 //   dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
506 //
507 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
509 {
510     return vgetq_lane_s64(
511         vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0);
512 }
513 
514 // Sets the 128-bit value to zero
515 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
FORCE_INLINE __m128i _mm_setzero_si128(void)
517 {
518     return vreinterpretq_m128i_s32(vdupq_n_s32(0));
519 }
520 
521 // Clears the four single-precision, floating-point values.
522 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setzero_ps(void)
524 {
525     return vreinterpretq_m128_f32(vdupq_n_f32(0));
526 }
527 
528 // Return vector of type __m128d with all elements set to zero.
529 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
FORCE_INLINE __m128d _mm_setzero_pd(void)
531 {
532 #if defined(__aarch64__)
533     return vreinterpretq_m128d_f64(vdupq_n_f64(0));
534 #else
535     return vreinterpretq_m128d_f32(vdupq_n_f32(0));
536 #endif
537 }
538 
539 // Sets the four single-precision, floating-point values to w.
540 //
541 //   r0 := r1 := r2 := r3 := w
542 //
543 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set1_ps(float _w)
545 {
546     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
547 }
548 
549 // Sets the four single-precision, floating-point values to w.
550 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
552 {
553     return vreinterpretq_m128_f32(vdupq_n_f32(_w));
554 }
555 
556 // Sets the four single-precision, floating-point values to the four inputs.
557 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
559 {
560     float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
561     return vreinterpretq_m128_f32(vld1q_f32(data));
562 }
563 
564 // Copy single-precision (32-bit) floating-point element a to the lower element
565 // of dst, and zero the upper 3 elements.
566 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
FORCE_INLINE __m128 _mm_set_ss(float a)
568 {
569     float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
570     return vreinterpretq_m128_f32(vld1q_f32(data));
571 }
572 
573 // Sets the four single-precision, floating-point values to the four inputs in
574 // reverse order.
575 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
577 {
578     float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
579     return vreinterpretq_m128_f32(vld1q_f32(data));
580 }
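
// Memory-order sketch for _mm_set_ps vs. _mm_setr_ps (illustrative only; it
// assumes the _mm_storeu_ps defined later in this header):
//
//   float out[4];
//   _mm_storeu_ps(out, _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f));  /* {1, 2, 3, 4} */
//   _mm_storeu_ps(out, _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f)); /* {1, 2, 3, 4} */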
581 
582 // Sets the 8 signed 16-bit integer values in reverse order.
583 //
584 // Return Value
585 //   r0 := w0
586 //   r1 := w1
587 //   ...
588 //   r7 := w7
FORCE_INLINE __m128i _mm_setr_epi16(short w0,
590                                     short w1,
591                                     short w2,
592                                     short w3,
593                                     short w4,
594                                     short w5,
595                                     short w6,
596                                     short w7)
597 {
598     int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
599     return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
600 }
601 
602 // Sets the 4 signed 32-bit integer values in reverse order
603 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
605 {
606     int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
607     return vreinterpretq_m128i_s32(vld1q_s32(data));
608 }
609 
610 // Set packed 64-bit integers in dst with the supplied values in reverse order.
611 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
613 {
614     return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
615 }
616 
617 // Sets the 16 signed 8-bit integer values to b.
618 //
619 //   r0 := b
620 //   r1 := b
621 //   ...
622 //   r15 := b
623 //
624 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
626 {
627     return vreinterpretq_m128i_s8(vdupq_n_s8(w));
628 }
629 
630 // Sets the 8 signed 16-bit integer values to w.
631 //
632 //   r0 := w
633 //   r1 := w
634 //   ...
635 //   r7 := w
636 //
637 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set1_epi16(short w)
639 {
640     return vreinterpretq_m128i_s16(vdupq_n_s16(w));
641 }
642 
643 // Sets the 16 signed 8-bit integer values.
644 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
646                                   signed char b14,
647                                   signed char b13,
648                                   signed char b12,
649                                   signed char b11,
650                                   signed char b10,
651                                   signed char b9,
652                                   signed char b8,
653                                   signed char b7,
654                                   signed char b6,
655                                   signed char b5,
656                                   signed char b4,
657                                   signed char b3,
658                                   signed char b2,
659                                   signed char b1,
660                                   signed char b0)
661 {
662     int8_t ALIGN_STRUCT(16)
663         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
664                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
665                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
666                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
667     return (__m128i) vld1q_s8(data);
668 }
669 
670 // Sets the 8 signed 16-bit integer values.
671 // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi16(short i7,
673                                    short i6,
674                                    short i5,
675                                    short i4,
676                                    short i3,
677                                    short i2,
678                                    short i1,
679                                    short i0)
680 {
681     int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
682     return vreinterpretq_m128i_s16(vld1q_s16(data));
683 }
684 
685 // Sets the 16 signed 8-bit integer values in reverse order.
686 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
688                                    signed char b1,
689                                    signed char b2,
690                                    signed char b3,
691                                    signed char b4,
692                                    signed char b5,
693                                    signed char b6,
694                                    signed char b7,
695                                    signed char b8,
696                                    signed char b9,
697                                    signed char b10,
698                                    signed char b11,
699                                    signed char b12,
700                                    signed char b13,
701                                    signed char b14,
702                                    signed char b15)
703 {
704     int8_t ALIGN_STRUCT(16)
705         data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
706                     (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
707                     (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
708                     (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
709     return (__m128i) vld1q_s8(data);
710 }
711 
712 // Sets the 4 signed 32-bit integer values to i.
713 //
714 //   r0 := i
715 //   r1 := i
716 //   r2 := i
//   r3 := i
718 //
719 // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi32(int _i)
721 {
722     return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
723 }
724 
725 // Sets the 2 signed 64-bit integer values to i.
726 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
728 {
729     return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
730 }
731 
732 // Sets the 2 signed 64-bit integer values to i.
733 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
735 {
736     return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
737 }
738 
739 // Sets the 4 signed 32-bit integer values.
740 // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
742 {
743     int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
744     return vreinterpretq_m128i_s32(vld1q_s32(data));
745 }
746 
747 // Returns the __m128i structure with its two 64-bit integer values
748 // initialized to the values of the two 64-bit integers passed in.
749 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
751 {
752     int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
753     return vreinterpretq_m128i_s64(vld1q_s64(data));
754 }
755 
756 // Returns the __m128i structure with its two 64-bit integer values
757 // initialized to the values of the two 64-bit integers passed in.
758 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
760 {
761     return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
762 }
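
// Ordering sketch (illustrative only): the first argument of _mm_set_epi64x
// becomes the upper 64-bit lane and the second the lower 64-bit lane.
//
//   __m128i v = _mm_set_epi64x(/* high */ 2, /* low */ 1);
//   /* vgetq_lane_s64(vreinterpretq_s64_m128i(v), 0) == 1 */
//   /* vgetq_lane_s64(vreinterpretq_s64_m128i(v), 1) == 2 */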
763 
764 // Set packed double-precision (64-bit) floating-point elements in dst with the
765 // supplied values.
766 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
768 {
769     double ALIGN_STRUCT(16) data[2] = {e0, e1};
770 #if defined(__aarch64__)
771     return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
772 #else
773     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
774 #endif
775 }
776 
777 // Stores four single-precision, floating-point values.
778 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
780 {
781     vst1q_f32(p, vreinterpretq_f32_m128(a));
782 }
783 
784 // Stores four single-precision, floating-point values.
785 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
787 {
788     vst1q_f32(p, vreinterpretq_f32_m128(a));
789 }
790 
// Stores four 32-bit integer values (as a __m128i value) at the address p.
792 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
794 {
795     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
796 }
797 
// Stores four 32-bit integer values (as a __m128i value) at the address p.
799 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
801 {
802     vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
803 }
804 
// Stores the lower single-precision, floating-point value.
806 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
808 {
809     vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
810 }
811 
812 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
813 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
814 // or a general-protection exception may be generated.
815 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
817 {
818 #if defined(__aarch64__)
819     vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
820 #else
821     vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
822 #endif
823 }
824 
825 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
826 // elements) from a into memory. mem_addr does not need to be aligned on any
827 // particular boundary.
828 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
830 {
831     _mm_store_pd(mem_addr, a);
832 }
833 
834 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
835 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
837 {
838     uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
839     uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
840     *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
841 }
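
// Behaviour sketch (illustrative only): only the lower 64 bits of *a are
// replaced; its upper 64 bits are preserved.
//
//   __m128i a = _mm_set_epi64x(/* high */ 100, /* low */ 200);
//   __m128i b = _mm_set_epi64x(/* high */ 300, /* low */ 400);
//   _mm_storel_epi64(&a, b); /* a now holds low = 400, high = 100 */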
842 
843 // Stores the lower two single-precision floating point values of a to the
844 // address p.
845 //
846 //   *p0 := a0
847 //   *p1 := a1
848 //
849 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
851 {
852     *p = vreinterpret_m64_f32(vget_low_f32(a));
853 }
854 
855 // Stores the upper two single-precision, floating-point values of a to the
856 // address p.
857 //
858 //   *p0 := a2
859 //   *p1 := a3
860 //
861 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
863 {
864     *p = vreinterpret_m64_f32(vget_high_f32(a));
865 }
866 
867 // Loads a single single-precision, floating-point value, copying it into all
868 // four words
869 // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
871 {
872     return vreinterpretq_m128_f32(vld1q_dup_f32(p));
873 }
874 
875 // Load a single-precision (32-bit) floating-point element from memory into all
876 // elements of dst.
877 //
878 //   dst[31:0] := MEM[mem_addr+31:mem_addr]
879 //   dst[63:32] := MEM[mem_addr+31:mem_addr]
880 //   dst[95:64] := MEM[mem_addr+31:mem_addr]
881 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
882 //
883 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
884 #define _mm_load_ps1 _mm_load1_ps
885 
886 // Sets the lower two single-precision, floating-point values with 64
887 // bits of data loaded from the address p; the upper two values are passed
888 // through from a.
889 //
890 // Return Value
891 //   r0 := *p0
892 //   r1 := *p1
893 //   r2 := a2
894 //   r3 := a3
895 //
896 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
898 {
899     return vreinterpretq_m128_f32(
900         vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
901 }
902 
903 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
904 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
905 // general-protection exception may be generated.
906 //
907 //   dst[31:0] := MEM[mem_addr+127:mem_addr+96]
908 //   dst[63:32] := MEM[mem_addr+95:mem_addr+64]
909 //   dst[95:64] := MEM[mem_addr+63:mem_addr+32]
910 //   dst[127:96] := MEM[mem_addr+31:mem_addr]
911 //
912 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
914 {
915     float32x4_t v = vrev64q_f32(vld1q_f32(p));
916     return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
917 }
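
// Reversal sketch (illustrative only):
//
//   float ALIGN_STRUCT(16) in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//   __m128 r = _mm_loadr_ps(in); /* lanes are {4, 3, 2, 1} */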
918 
919 // Sets the upper two single-precision, floating-point values with 64
920 // bits of data loaded from the address p; the lower two values are passed
921 // through from a.
922 //
923 //   r0 := a0
924 //   r1 := a1
925 //   r2 := *p0
926 //   r3 := *p1
927 //
928 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
930 {
931     return vreinterpretq_m128_f32(
932         vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
933 }
934 
935 // Loads four single-precision, floating-point values.
936 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
938 {
939     return vreinterpretq_m128_f32(vld1q_f32(p));
940 }
941 
942 // Loads four single-precision, floating-point values.
943 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
945 {
    // On NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
    // equivalent.
948     return vreinterpretq_m128_f32(vld1q_f32(p));
949 }
950 
951 // Load unaligned 16-bit integer from memory into the first element of dst.
952 //
953 //   dst[15:0] := MEM[mem_addr+15:mem_addr]
954 //   dst[MAX:16] := 0
955 //
956 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
958 {
959     return vreinterpretq_m128i_s16(
960         vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
961 }
962 
963 // Load unaligned 64-bit integer from memory into the first element of dst.
964 //
965 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
966 //   dst[MAX:64] := 0
967 //
968 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
970 {
971     return vreinterpretq_m128i_s64(
972         vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
973 }
974 
975 // Load a double-precision (64-bit) floating-point element from memory into the
976 // lower of dst, and zero the upper element. mem_addr does not need to be
977 // aligned on any particular boundary.
978 //
979 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
980 //   dst[127:64] := 0
981 //
982 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
FORCE_INLINE __m128d _mm_load_sd(const double *p)
984 {
985 #if defined(__aarch64__)
986     return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
987 #else
988     const float *fp = (const float *) p;
989     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
990     return vreinterpretq_m128d_f32(vld1q_f32(data));
991 #endif
992 }
993 
// Loads two double-precision, floating-point values from 16-byte aligned
// memory.
996 //
997 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
998 //
999 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
FORCE_INLINE __m128d _mm_load_pd(const double *p)
1001 {
1002 #if defined(__aarch64__)
1003     return vreinterpretq_m128d_f64(vld1q_f64(p));
1004 #else
1005     const float *fp = (const float *) p;
1006     float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
1007     return vreinterpretq_m128d_f32(vld1q_f32(data));
1008 #endif
1009 }
1010 
// Loads two double-precision, floating-point values from unaligned memory.
1012 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
1014 {
1015     return _mm_load_pd(p);
1016 }
1017 
// Loads a single-precision, floating-point value into the low word and
// clears the upper three words.
1020 // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
1022 {
1023     return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1024 }
1025 
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
1027 {
1028     /* Load the lower 64 bits of the value pointed to by p into the
1029      * lower 64 bits of the result, zeroing the upper 64 bits of the result.
1030      */
1031     return vreinterpretq_m128i_s32(
1032         vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
1033 }
1034 
1035 // Load a double-precision (64-bit) floating-point element from memory into the
1036 // lower element of dst, and copy the upper element from a to dst. mem_addr does
1037 // not need to be aligned on any particular boundary.
1038 //
1039 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
1040 //   dst[127:64] := a[127:64]
1041 //
1042 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
1044 {
1045 #if defined(__aarch64__)
1046     return vreinterpretq_m128d_f64(
1047         vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
1048 #else
1049     return vreinterpretq_m128d_f32(
1050         vcombine_f32(vld1_f32((const float *) p),
1051                      vget_high_f32(vreinterpretq_f32_m128d(a))));
1052 #endif
1053 }
1054 
1055 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
1056 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1057 // general-protection exception may be generated.
1058 //
1059 //   dst[63:0] := MEM[mem_addr+127:mem_addr+64]
1060 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
1061 //
1062 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
1064 {
1065 #if defined(__aarch64__)
1066     float64x2_t v = vld1q_f64(p);
1067     return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
1068 #else
1069     int64x2_t v = vld1q_s64((const int64_t *) p);
1070     return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
1071 #endif
1072 }
1073 
1074 // Sets the low word to the single-precision, floating-point value of b
1075 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
1077 {
1078     return vreinterpretq_m128_f32(
1079         vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
1080                        vreinterpretq_f32_m128(a), 0));
1081 }
1082 
1083 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
1084 // upper element.
1085 //
1086 //   dst[63:0] := a[63:0]
1087 //   dst[127:64] := 0
1088 //
1089 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
1091 {
1092     return vreinterpretq_m128i_s64(
1093         vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
1094 }
1095 
1096 // Return vector of type __m128 with undefined elements.
1097 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
1099 {
1100     __m128 a;
1101     return a;
1102 }
1103 
1104 /* Logic/Binary operations */
1105 
1106 // Computes the bitwise AND-NOT of the four single-precision, floating-point
1107 // values of a and b.
1108 //
1109 //   r0 := ~a0 & b0
1110 //   r1 := ~a1 & b1
1111 //   r2 := ~a2 & b2
1112 //   r3 := ~a3 & b3
1113 //
1114 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1116 {
1117     return vreinterpretq_m128_s32(
1118         vbicq_s32(vreinterpretq_s32_m128(b),
1119                   vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
1120 }
1121 
1122 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
1123 // elements in a and then AND with b, and store the results in dst.
1124 //
//   FOR j := 0 to 1
//     i := j*64
//     dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
//   ENDFOR
1129 //
1130 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
1132 {
1133     // *NOTE* argument swap
1134     return vreinterpretq_m128d_s64(
1135         vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
1136 }
1137 
1138 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
1139 // 128-bit value in a.
1140 //
1141 //   r := (~a) & b
1142 //
1143 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
1145 {
1146     return vreinterpretq_m128i_s32(
1147         vbicq_s32(vreinterpretq_s32_m128i(b),
1148                   vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
1149 }
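
// Semantics sketch (illustrative only): the first operand is the one that is
// complemented.
//
//   __m128i a = _mm_set1_epi32(0x0F0F0F0F);
//   __m128i b = _mm_set1_epi32(-1); /* all bits set */
//   __m128i r = _mm_andnot_si128(a, b); /* every 32-bit lane == 0xF0F0F0F0 */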
1150 
1151 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
1152 // b.
1153 //
1154 //   r := a & b
1155 //
1156 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
1158 {
1159     return vreinterpretq_m128i_s32(
1160         vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1161 }
1162 
1163 // Computes the bitwise AND of the four single-precision, floating-point values
1164 // of a and b.
1165 //
1166 //   r0 := a0 & b0
1167 //   r1 := a1 & b1
1168 //   r2 := a2 & b2
1169 //   r3 := a3 & b3
1170 //
1171 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1173 {
1174     return vreinterpretq_m128_s32(
1175         vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1176 }
1177 
1178 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
1179 // elements in a and b, and store the results in dst.
1180 //
1181 //   FOR j := 0 to 1
1182 //     i := j*64
1183 //     dst[i+63:i] := a[i+63:i] AND b[i+63:i]
1184 //   ENDFOR
1185 //
1186 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
1188 {
1189     return vreinterpretq_m128d_s64(
1190         vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1191 }
1192 
1193 // Computes the bitwise OR of the four single-precision, floating-point values
1194 // of a and b.
1195 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
1197 {
1198     return vreinterpretq_m128_s32(
1199         vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1200 }
1201 
1202 // Computes bitwise EXOR (exclusive-or) of the four single-precision,
1203 // floating-point values of a and b.
1204 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
1206 {
1207     return vreinterpretq_m128_s32(
1208         veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1209 }
1210 
1211 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
1212 // elements in a and b, and store the results in dst.
1213 //
1214 //   FOR j := 0 to 1
1215 //      i := j*64
1216 //      dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
1217 //   ENDFOR
1218 //
1219 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
1221 {
1222     return vreinterpretq_m128d_s64(
1223         veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1224 }
1225 
1226 // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
1227 //
1228 //   r := a | b
1229 //
1230 // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
1232 {
1233     return vreinterpretq_m128i_s32(
1234         vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1235 }
1236 
1237 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
1238 // b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
1240 {
1241     return vreinterpretq_m128i_s32(
1242         veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1243 }
1244 
1245 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
1246 // from a, and store the results in dst.
1247 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
1249 {
1250 #if __has_builtin(__builtin_shufflevector)
1251     return vreinterpretq_m128_f32(__builtin_shufflevector(
1252         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
1253 #else
1254     float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
1255     float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
1256     float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
1257     return vreinterpretq_m128_f32(vld1q_f32(data));
1258 #endif
1259 }
1260 
1261 // Duplicate even-indexed single-precision (32-bit) floating-point elements
1262 // from a, and store the results in dst.
1263 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
1265 {
1266 #if __has_builtin(__builtin_shufflevector)
1267     return vreinterpretq_m128_f32(__builtin_shufflevector(
1268         vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
1269 #else
1270     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1271     float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
1272     float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
1273     return vreinterpretq_m128_f32(vld1q_f32(data));
1274 #endif
1275 }
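
// Duplication sketch for the two intrinsics above (illustrative only):
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   _mm_movehdup_ps(a); /* lanes {2, 2, 4, 4} */
//   _mm_moveldup_ps(a); /* lanes {1, 1, 3, 3} */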
1276 
1277 // Moves the upper two values of B into the lower two values of A.
1278 //
1279 //   r3 := a3
1280 //   r2 := a2
1281 //   r1 := b3
1282 //   r0 := b2
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1284 {
1285     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
1286     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
1287     return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
1288 }
1289 
1290 // Moves the lower two values of B into the upper two values of A.
1291 //
1292 //   r3 := b1
1293 //   r2 := b0
1294 //   r1 := a1
1295 //   r0 := a0
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
1297 {
1298     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
1299     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
1300     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1301 }
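
// Combination sketch for _mm_movehl_ps/_mm_movelh_ps (illustrative only):
//
//   __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
//   __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
//   _mm_movelh_ps(a, b); /* lanes {0, 1, 4, 5} */
//   _mm_movehl_ps(a, b); /* lanes {6, 7, 2, 3} */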
1302 
1303 // Compute the absolute value of packed signed 32-bit integers in a, and store
1304 // the unsigned results in dst.
1305 //
1306 //   FOR j := 0 to 3
1307 //     i := j*32
1308 //     dst[i+31:i] := ABS(a[i+31:i])
1309 //   ENDFOR
1310 //
1311 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
1312 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
1313 {
1314     return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
1315 }
1316 
1317 // Compute the absolute value of packed signed 16-bit integers in a, and store
1318 // the unsigned results in dst.
1319 //
1320 //   FOR j := 0 to 7
1321 //     i := j*16
1322 //     dst[i+15:i] := ABS(a[i+15:i])
1323 //   ENDFOR
1324 //
1325 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
1326 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
1327 {
1328     return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
1329 }
1330 
1331 // Compute the absolute value of packed signed 8-bit integers in a, and store
1332 // the unsigned results in dst.
1333 //
1334 //   FOR j := 0 to 15
1335 //     i := j*8
1336 //     dst[i+7:i] := ABS(a[i+7:i])
1337 //   ENDFOR
1338 //
1339 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
1340 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
1341 {
1342     return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
1343 }
1344 
1345 // Compute the absolute value of packed signed 32-bit integers in a, and store
1346 // the unsigned results in dst.
1347 //
1348 //   FOR j := 0 to 1
1349 //     i := j*32
1350 //     dst[i+31:i] := ABS(a[i+31:i])
1351 //   ENDFOR
1352 //
1353 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
1354 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
1355 {
1356     return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
1357 }
1358 
1359 // Compute the absolute value of packed signed 16-bit integers in a, and store
1360 // the unsigned results in dst.
1361 //
1362 //   FOR j := 0 to 3
1363 //     i := j*16
1364 //     dst[i+15:i] := ABS(a[i+15:i])
1365 //   ENDFOR
1366 //
1367 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
1368 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
1369 {
1370     return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
1371 }
1372 
1373 // Compute the absolute value of packed signed 8-bit integers in a, and store
1374 // the unsigned results in dst.
1375 //
1376 //   FOR j := 0 to 7
1377 //     i := j*8
1378 //     dst[i+7:i] := ABS(a[i+7:i])
1379 //   ENDFOR
1380 //
1381 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
1382 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
1383 {
1384     return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
1385 }
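// Illustrative usage (a sketch; assumes _mm_set_epi32 from elsewhere in this
// header). As with x86 PABSD, the absolute value of INT32_MIN cannot be
// represented and comes back as 0x80000000:
//
//   __m128i v = _mm_set_epi32(INT32_MIN, -7, 0, 7);
//   __m128i r = _mm_abs_epi32(v); // elements 0..3: {7, 0, 7, 0x80000000}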
1386 
1387 // Takes the upper 64 bits of a and places it in the low end of the result
1388 // Takes the lower 64 bits of b and places it into the high end of the result.
1389 FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
1390 {
1391     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1392     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1393     return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
1394 }
1395 
1396 // Takes the lower two 32-bit values from a, swaps them, and places them in
1397 // the low end of the result; takes the upper two 32-bit values from b, swaps
1398 // them, and places them in the high end of the result.
1399 FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
1400 {
1401     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1402     float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
1403     return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
1404 }
1405 
1406 FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
1407 {
1408     float32x2_t a21 = vget_high_f32(
1409         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1410     float32x2_t b03 = vget_low_f32(
1411         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1412     return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
1413 }
1414 
1415 FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
1416 {
1417     float32x2_t a03 = vget_low_f32(
1418         vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1419     float32x2_t b21 = vget_high_f32(
1420         vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1421     return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
1422 }
1423 
1424 FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
1425 {
1426     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1427     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1428     return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1429 }
1430 
1431 FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
1432 {
1433     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1434     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1435     return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
1436 }
1437 
1438 FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
1439 {
1440     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1441     float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
1442     return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
1443 }
1444 
1445 // Keeps the low 64 bits of a in the low half and puts the high 64 bits of b
1446 // in the high half.
1447 FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
1448 {
1449     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1450     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1451     return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
1452 }
1453 
1454 FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
1455 {
1456     float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
1457     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1458     return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
1459 }
1460 
1461 FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
1462 {
1463     float32x2_t a22 =
1464         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1465     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1466     return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
1467 }
1468 
1469 FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
1470 {
1471     float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
1472     float32x2_t b22 =
1473         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
1474     return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
1475 }
1476 
1477 FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
1478 {
1479     float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1480     float32x2_t a22 =
1481         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1482     float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
1483     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1484     return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
1485 }
1486 
1487 FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
1488 {
1489     float32x2_t a33 =
1490         vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
1491     float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
1492     return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
1493 }
1494 
1495 FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
1496 {
1497     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1498     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1499     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1500     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1501     return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
1502 }
1503 
1504 FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
1505 {
1506     float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1507     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1508     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1509     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1510     return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
1511 }
1512 
1513 FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
1514 {
1515     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1516     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1517     float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1518     float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1519     return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
1520 }
1521 
1522 // NEON does not support a general purpose permute intrinsic
1523 // Selects four specific single-precision, floating-point values from a and b,
1524 // based on the mask i.
1525 //
1526 // C equivalent:
1527 //   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1528 //                                 __constrange(0, 255) int imm) {
1529 //       __m128 ret;
1530 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
1531 //       ret[2] = b[(imm >> 4) & 0x03];  ret[3] = b[(imm >> 6) & 0x03];
1532 //       return ret;
1533 //   }
1534 //
1535 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
1536 #define _mm_shuffle_ps_default(a, b, imm)                                  \
1537     __extension__({                                                        \
1538         float32x4_t ret;                                                   \
1539         ret = vmovq_n_f32(                                                 \
1540             vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3)));     \
1541         ret = vsetq_lane_f32(                                              \
1542             vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1543             ret, 1);                                                       \
1544         ret = vsetq_lane_f32(                                              \
1545             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1546             ret, 2);                                                       \
1547         ret = vsetq_lane_f32(                                              \
1548             vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1549             ret, 3);                                                       \
1550         vreinterpretq_m128_f32(ret);                                       \
1551     })
1552 
1553 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
1554 // int imm)
1555 #if __has_builtin(__builtin_shufflevector)
1556 #define _mm_shuffle_ps(a, b, imm)                                \
1557     __extension__({                                              \
1558         float32x4_t _input1 = vreinterpretq_f32_m128(a);         \
1559         float32x4_t _input2 = vreinterpretq_f32_m128(b);         \
1560         float32x4_t _shuf = __builtin_shufflevector(             \
1561             _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1562             (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
1563         vreinterpretq_m128_f32(_shuf);                           \
1564     })
1565 #else  // generic
1566 #define _mm_shuffle_ps(a, b, imm)                          \
1567     __extension__({                                        \
1568         __m128 ret;                                        \
1569         switch (imm) {                                     \
1570         case _MM_SHUFFLE(1, 0, 3, 2):                      \
1571             ret = _mm_shuffle_ps_1032((a), (b));           \
1572             break;                                         \
1573         case _MM_SHUFFLE(2, 3, 0, 1):                      \
1574             ret = _mm_shuffle_ps_2301((a), (b));           \
1575             break;                                         \
1576         case _MM_SHUFFLE(0, 3, 2, 1):                      \
1577             ret = _mm_shuffle_ps_0321((a), (b));           \
1578             break;                                         \
1579         case _MM_SHUFFLE(2, 1, 0, 3):                      \
1580             ret = _mm_shuffle_ps_2103((a), (b));           \
1581             break;                                         \
1582         case _MM_SHUFFLE(1, 0, 1, 0):                      \
1583             ret = _mm_movelh_ps((a), (b));                 \
1584             break;                                         \
1585         case _MM_SHUFFLE(1, 0, 0, 1):                      \
1586             ret = _mm_shuffle_ps_1001((a), (b));           \
1587             break;                                         \
1588         case _MM_SHUFFLE(0, 1, 0, 1):                      \
1589             ret = _mm_shuffle_ps_0101((a), (b));           \
1590             break;                                         \
1591         case _MM_SHUFFLE(3, 2, 1, 0):                      \
1592             ret = _mm_shuffle_ps_3210((a), (b));           \
1593             break;                                         \
1594         case _MM_SHUFFLE(0, 0, 1, 1):                      \
1595             ret = _mm_shuffle_ps_0011((a), (b));           \
1596             break;                                         \
1597         case _MM_SHUFFLE(0, 0, 2, 2):                      \
1598             ret = _mm_shuffle_ps_0022((a), (b));           \
1599             break;                                         \
1600         case _MM_SHUFFLE(2, 2, 0, 0):                      \
1601             ret = _mm_shuffle_ps_2200((a), (b));           \
1602             break;                                         \
1603         case _MM_SHUFFLE(3, 2, 0, 2):                      \
1604             ret = _mm_shuffle_ps_3202((a), (b));           \
1605             break;                                         \
1606         case _MM_SHUFFLE(3, 2, 3, 2):                      \
1607             ret = _mm_movehl_ps((b), (a));                 \
1608             break;                                         \
1609         case _MM_SHUFFLE(1, 1, 3, 3):                      \
1610             ret = _mm_shuffle_ps_1133((a), (b));           \
1611             break;                                         \
1612         case _MM_SHUFFLE(2, 0, 1, 0):                      \
1613             ret = _mm_shuffle_ps_2010((a), (b));           \
1614             break;                                         \
1615         case _MM_SHUFFLE(2, 0, 0, 1):                      \
1616             ret = _mm_shuffle_ps_2001((a), (b));           \
1617             break;                                         \
1618         case _MM_SHUFFLE(2, 0, 3, 2):                      \
1619             ret = _mm_shuffle_ps_2032((a), (b));           \
1620             break;                                         \
1621         default:                                           \
1622             ret = _mm_shuffle_ps_default((a), (b), (imm)); \
1623             break;                                         \
1624         }                                                  \
1625         ret;                                               \
1626     })
1627 #endif
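// Illustrative usage of the macro above (a sketch; assumes _mm_setr_ps and the
// _MM_SHUFFLE helper from elsewhere in this header):
//
//   __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
//   __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
//   // r = {a[2], a[1], b[0], b[3]} = {2, 1, 4, 7}
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 0, 1, 2));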
1628 
1629 // Takes the upper 64 bits of a and places it in the low end of the result
1630 // Takes the lower 64 bits of a and places it into the high end of the result.
1631 FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
1632 {
1633     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1634     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1635     return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1636 }
1637 
1638 // Takes the lower two 32-bit values from a, swaps them, and places them in the
1639 // low end of the result; takes the upper two 32-bit values from a, swaps them,
1640 // and places them in the high end of the result.
1641 FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
1642 {
1643     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1644     int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1645     return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1646 }
1647 
1648 // rotates the least significant 32 bits into the most significant 32 bits, and
1649 // shifts the rest down
1650 FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
1651 {
1652     return vreinterpretq_m128i_s32(
1653         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
1654 }
1655 
1656 // rotates the most significant 32 bits into the least significant 32 bits, and
1657 // shifts the rest up
1658 FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
1659 {
1660     return vreinterpretq_m128i_s32(
1661         vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
1662 }
1663 
1664 // gets the lower 64 bits of a, and places it in the upper 64 bits
1665 // gets the lower 64 bits of a and places it in the lower 64 bits
1666 FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
1667 {
1668     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1669     return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1670 }
1671 
1672 // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
1673 // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
1674 FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
1675 {
1676     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1677     int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1678     return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1679 }
1680 
1681 // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
1682 // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
1683 // places it in the lower 64 bits
1684 FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
1685 {
1686     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1687     return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1688 }
1689 
1690 FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
1691 {
1692     int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1693     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1694     return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1695 }
1696 
1697 FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
1698 {
1699     int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1700     int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1701     return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1702 }
1703 
1704 FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
1705 {
1706     int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1707     int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1708     return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1709 }
1710 
1711 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
1712 // corresponding 8-bit element of b, and store the results in dst.
1713 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
1714 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
1715 {
1716     int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
1717     uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
1718     uint8x16_t idx_masked =
1719         vandq_u8(idx, vdupq_n_u8(0x8F));  // keep the zeroing bit and index bits
1720 #if defined(__aarch64__)
1721     return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
1722 #elif defined(__GNUC__)
1723     int8x16_t ret;
1724     // %e and %f represent the even and odd D registers
1725     // respectively.
1726     __asm__ __volatile__(
1727         "vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
1728         "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
1729         : [ret] "=&w"(ret)
1730         : [tbl] "w"(tbl), [idx] "w"(idx_masked));
1731     return vreinterpretq_m128i_s8(ret);
1732 #else
1733     // Generic fallback: split the table and emulate the lookup with vtbl2
1734     int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
1735     return vreinterpretq_m128i_s8(
1736         vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
1737                     vtbl2_s8(a_split, vget_high_u8(idx_masked))));
1738 #endif
1739 }
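// Illustrative usage (a sketch; v is an arbitrary __m128i and _mm_setr_epi8 is
// assumed from elsewhere in this header). Any index byte with bit 7 set zeroes
// the corresponding destination byte, matching PSHUFB:
//
//   const __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
//                                     7, 6, 5, 4, 3, 2, 1, 0);
//   __m128i r = _mm_shuffle_epi8(v, rev); // reverses the 16 bytes of v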
1740 
1741 // C equivalent:
1742 //   __m128i _mm_shuffle_epi32_default(__m128i a,
1743 //                                     __constrange(0, 255) int imm) {
1744 //       __m128i ret;
1745 //       ret[0] = a[imm        & 0x3];   ret[1] = a[(imm >> 2) & 0x3];
1746 //       ret[2] = a[(imm >> 4) & 0x03];  ret[3] = a[(imm >> 6) & 0x03];
1747 //       return ret;
1748 //   }
1749 #define _mm_shuffle_epi32_default(a, imm)                                   \
1750     __extension__({                                                         \
1751         int32x4_t ret;                                                      \
1752         ret = vmovq_n_s32(                                                  \
1753             vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3)));     \
1754         ret = vsetq_lane_s32(                                               \
1755             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
1756             ret, 1);                                                        \
1757         ret = vsetq_lane_s32(                                               \
1758             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
1759             ret, 2);                                                        \
1760         ret = vsetq_lane_s32(                                               \
1761             vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
1762             ret, 3);                                                        \
1763         vreinterpretq_m128i_s32(ret);                                       \
1764     })
1765 
1766 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
1767 // int imm)
1768 #if defined(__aarch64__)
1769 #define _mm_shuffle_epi32_splat(a, imm)                          \
1770     __extension__({                                              \
1771         vreinterpretq_m128i_s32(                                 \
1772             vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
1773     })
1774 #else
1775 #define _mm_shuffle_epi32_splat(a, imm)                                      \
1776     __extension__({                                                          \
1777         vreinterpretq_m128i_s32(                                             \
1778             vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
1779     })
1780 #endif
1781 
1782 // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
1783 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
1784 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
1785 //                                        __constrange(0,255) int imm)
1786 #if __has_builtin(__builtin_shufflevector)
1787 #define _mm_shuffle_epi32(a, imm)                              \
1788     __extension__({                                            \
1789         int32x4_t _input = vreinterpretq_s32_m128i(a);         \
1790         int32x4_t _shuf = __builtin_shufflevector(             \
1791             _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1792             ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
1793         vreinterpretq_m128i_s32(_shuf);                        \
1794     })
1795 #else  // generic
1796 #define _mm_shuffle_epi32(a, imm)                        \
1797     __extension__({                                      \
1798         __m128i ret;                                     \
1799         switch (imm) {                                   \
1800         case _MM_SHUFFLE(1, 0, 3, 2):                    \
1801             ret = _mm_shuffle_epi_1032((a));             \
1802             break;                                       \
1803         case _MM_SHUFFLE(2, 3, 0, 1):                    \
1804             ret = _mm_shuffle_epi_2301((a));             \
1805             break;                                       \
1806         case _MM_SHUFFLE(0, 3, 2, 1):                    \
1807             ret = _mm_shuffle_epi_0321((a));             \
1808             break;                                       \
1809         case _MM_SHUFFLE(2, 1, 0, 3):                    \
1810             ret = _mm_shuffle_epi_2103((a));             \
1811             break;                                       \
1812         case _MM_SHUFFLE(1, 0, 1, 0):                    \
1813             ret = _mm_shuffle_epi_1010((a));             \
1814             break;                                       \
1815         case _MM_SHUFFLE(1, 0, 0, 1):                    \
1816             ret = _mm_shuffle_epi_1001((a));             \
1817             break;                                       \
1818         case _MM_SHUFFLE(0, 1, 0, 1):                    \
1819             ret = _mm_shuffle_epi_0101((a));             \
1820             break;                                       \
1821         case _MM_SHUFFLE(2, 2, 1, 1):                    \
1822             ret = _mm_shuffle_epi_2211((a));             \
1823             break;                                       \
1824         case _MM_SHUFFLE(0, 1, 2, 2):                    \
1825             ret = _mm_shuffle_epi_0122((a));             \
1826             break;                                       \
1827         case _MM_SHUFFLE(3, 3, 3, 2):                    \
1828             ret = _mm_shuffle_epi_3332((a));             \
1829             break;                                       \
1830         case _MM_SHUFFLE(0, 0, 0, 0):                    \
1831             ret = _mm_shuffle_epi32_splat((a), 0);       \
1832             break;                                       \
1833         case _MM_SHUFFLE(1, 1, 1, 1):                    \
1834             ret = _mm_shuffle_epi32_splat((a), 1);       \
1835             break;                                       \
1836         case _MM_SHUFFLE(2, 2, 2, 2):                    \
1837             ret = _mm_shuffle_epi32_splat((a), 2);       \
1838             break;                                       \
1839         case _MM_SHUFFLE(3, 3, 3, 3):                    \
1840             ret = _mm_shuffle_epi32_splat((a), 3);       \
1841             break;                                       \
1842         default:                                         \
1843             ret = _mm_shuffle_epi32_default((a), (imm)); \
1844             break;                                       \
1845         }                                                \
1846         ret;                                             \
1847     })
1848 #endif
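// Illustrative usage of the macro above (a sketch; assumes _mm_setr_epi32 and
// _MM_SHUFFLE from elsewhere in this header):
//
//   __m128i v = _mm_setr_epi32(10, 11, 12, 13);
//   // Broadcasts element 0 and takes the _mm_shuffle_epi32_splat() path:
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0)); // {10, 10, 10, 10}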
1849 
1850 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
1851 // by imm.
1852 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
1853 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
1854 //                                                   __constrange(0,255) int
1855 //                                                   imm)
1856 #define _mm_shufflelo_epi16_function(a, imm)                                  \
1857     __extension__({                                                           \
1858         int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
1859         int16x4_t lowBits = vget_low_s16(ret);                                \
1860         ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
1861         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1862                              1);                                              \
1863         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1864                              2);                                              \
1865         ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1866                              3);                                              \
1867         vreinterpretq_m128i_s16(ret);                                         \
1868     })
1869 
1870 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
1871 //                                          __constrange(0,255) int imm)
1872 #if __has_builtin(__builtin_shufflevector)
1873 #define _mm_shufflelo_epi16(a, imm)                                  \
1874     __extension__({                                                  \
1875         int16x8_t _input = vreinterpretq_s16_m128i(a);               \
1876         int16x8_t _shuf = __builtin_shufflevector(                   \
1877             _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3),   \
1878             (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
1879         vreinterpretq_m128i_s16(_shuf);                              \
1880     })
1881 #else  // generic
1882 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
1883 #endif
1884 
1885 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
1886 // by imm.
1887 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
1888 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
1889 //                                                   __constrange(0,255) int
1890 //                                                   imm)
1891 #define _mm_shufflehi_epi16_function(a, imm)                                   \
1892     __extension__({                                                            \
1893         int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
1894         int16x4_t highBits = vget_high_s16(ret);                               \
1895         ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
1896         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1897                              5);                                               \
1898         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1899                              6);                                               \
1900         ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1901                              7);                                               \
1902         vreinterpretq_m128i_s16(ret);                                          \
1903     })
1904 
1905 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
1906 //                                          __constrange(0,255) int imm)
1907 #if __has_builtin(__builtin_shufflevector)
1908 #define _mm_shufflehi_epi16(a, imm)                             \
1909     __extension__({                                             \
1910         int16x8_t _input = vreinterpretq_s16_m128i(a);          \
1911         int16x8_t _shuf = __builtin_shufflevector(              \
1912             _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
1913             (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
1914             (((imm) >> 6) & 0x3) + 4);                          \
1915         vreinterpretq_m128i_s16(_shuf);                         \
1916     })
1917 #else  // generic
1918 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
1919 #endif
1920 
1921 // Blend packed 16-bit integers from a and b using control mask imm8, and store
1922 // the results in dst.
1923 //
1924 //   FOR j := 0 to 7
1925 //       i := j*16
1926 //       IF imm8[j]
1927 //           dst[i+15:i] := b[i+15:i]
1928 //       ELSE
1929 //           dst[i+15:i] := a[i+15:i]
1930 //       FI
1931 //   ENDFOR
1932 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
1933 //                                      __constrange(0,255) int imm)
1934 #define _mm_blend_epi16(a, b, imm)                                        \
1935     __extension__({                                                       \
1936         const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000,  \
1937                                    ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,  \
1938                                    ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,  \
1939                                    ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,  \
1940                                    ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,  \
1941                                    ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,  \
1942                                    ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,  \
1943                                    ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
1944         uint16x8_t _mask_vec = vld1q_u16(_mask);                          \
1945         uint16x8_t _a = vreinterpretq_u16_m128i(a);                       \
1946         uint16x8_t _b = vreinterpretq_u16_m128i(b);                       \
1947         vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a));            \
1948     })
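// Illustrative usage (a sketch, for arbitrary __m128i values a and b): with
// imm8 = 0xAA (binary 10101010), even-indexed words come from a and
// odd-indexed words come from b:
//
//   __m128i r = _mm_blend_epi16(a, b, 0xAA);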
1949 
1950 // Blend packed 8-bit integers from a and b using mask, and store the results in
1951 // dst.
1952 //
1953 //   FOR j := 0 to 15
1954 //       i := j*8
1955 //       IF mask[i+7]
1956 //           dst[i+7:i] := b[i+7:i]
1957 //       ELSE
1958 //           dst[i+7:i] := a[i+7:i]
1959 //       FI
1960 //   ENDFOR
1961 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
1962 {
1963     // Use a signed shift right to create a mask with the sign bit
1964     uint8x16_t mask =
1965         vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
1966     uint8x16_t a = vreinterpretq_u8_m128i(_a);
1967     uint8x16_t b = vreinterpretq_u8_m128i(_b);
1968     return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
1969 }
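// Illustrative usage (a sketch; assumes _mm_cmpgt_epi8 from elsewhere in this
// header): a per-byte signed maximum built from a compare plus a blend.
//
//   __m128i gt  = _mm_cmpgt_epi8(b, a);      // 0xFF where b > a, else 0x00
//   __m128i max = _mm_blendv_epi8(a, b, gt); // picks b wherever the mask is set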
1970 
1971 /* Shifts */
1972 
1974 // Shift packed 16-bit integers in a right by imm while shifting in sign
1975 // bits, and store the results in dst.
1976 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
1977 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
1978 {
1979     const int count = (imm & ~15) ? 15 : imm;
1980     return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
1981 }
1982 
1983 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
1984 // shifting in zeros.
1985 //
1986 //   r0 := a0 << count
1987 //   r1 := a1 << count
1988 //   ...
1989 //   r7 := a7 << count
1990 //
1991 // https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
1992 #define _mm_slli_epi16(a, imm)                                   \
1993     __extension__({                                              \
1994         __m128i ret;                                             \
1995         if ((imm) <= 0) {                                        \
1996             ret = a;                                             \
1997         } else if ((imm) > 15) {                                 \
1998             ret = _mm_setzero_si128();                           \
1999         } else {                                                 \
2000             ret = vreinterpretq_m128i_s16(                       \
2001                 vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
2002         }                                                        \
2003         ret;                                                     \
2004     })
2005 
2006 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2007 // shifting in zeros.
2008 // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
2009 // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
2010 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
2011 {
2012     if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
2013         return a;
2014     if (imm > 31) /* TODO: add unlikely macro */
2015         return _mm_setzero_si128();
2016     return vreinterpretq_m128i_s32(
2017         vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
2018 }
2019 
2020 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
2021 // store the results in dst.
2022 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
2023 {
2024     if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
2025         return a;
2026     if (imm > 63) /* TODO: add unlikely macro */
2027         return _mm_setzero_si128();
2028     return vreinterpretq_m128i_s64(
2029         vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
2030 }
2031 
2032 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
2033 // store the results in dst.
2034 //
2035 //   FOR j := 0 to 7
2036 //     i := j*16
2037 //     IF imm8[7:0] > 15
2038 //       dst[i+15:i] := 0
2039 //     ELSE
2040 //       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
2041 //     FI
2042 //   ENDFOR
2043 //
2044 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
2045 #define _mm_srli_epi16(a, imm)                                             \
2046     __extension__({                                                        \
2047         __m128i ret;                                                       \
2048         if ((imm) == 0) {                                                  \
2049             ret = a;                                                       \
2050         } else if (0 < (imm) && (imm) < 16) {                              \
2051             ret = vreinterpretq_m128i_u16(                                 \
2052                 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-(imm)))); \
2053         } else {                                                           \
2054             ret = _mm_setzero_si128();                                     \
2055         }                                                                  \
2056         ret;                                                               \
2057     })
2058 
2059 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
2060 // store the results in dst.
2061 //
2062 //   FOR j := 0 to 3
2063 //     i := j*32
2064 //     IF imm8[7:0] > 31
2065 //       dst[i+31:i] := 0
2066 //     ELSE
2067 //       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
2068 //     FI
2069 //   ENDFOR
2070 //
2071 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
2072 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
2073 #define _mm_srli_epi32(a, imm)                                             \
2074     __extension__({                                                        \
2075         __m128i ret;                                                       \
2076         if ((imm) == 0) {                                                  \
2077             ret = a;                                                       \
2078         } else if (0 < (imm) && (imm) < 32) {                              \
2079             ret = vreinterpretq_m128i_u32(                                 \
2080                 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-(imm)))); \
2081         } else {                                                           \
2082             ret = _mm_setzero_si128();                                     \
2083         }                                                                  \
2084         ret;                                                               \
2085     })
2086 
2087 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
2088 // store the results in dst.
2089 //
2090 //   FOR j := 0 to 1
2091 //     i := j*64
2092 //     IF imm8[7:0] > 63
2093 //       dst[i+63:i] := 0
2094 //     ELSE
2095 //       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
2096 //     FI
2097 //   ENDFOR
2098 //
2099 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
2100 #define _mm_srli_epi64(a, imm)                                             \
2101     __extension__({                                                        \
2102         __m128i ret;                                                       \
2103         if ((imm) == 0) {                                                  \
2104             ret = a;                                                       \
2105         } else if (0 < (imm) && (imm) < 64) {                              \
2106             ret = vreinterpretq_m128i_u64(                                 \
2107                 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-(imm)))); \
2108         } else {                                                           \
2109             ret = _mm_setzero_si128();                                     \
2110         }                                                                  \
2111         ret;                                                               \
2112     })
2113 
2114 // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
2115 // and store the results in dst.
2116 //
2117 //   FOR j := 0 to 3
2118 //     i := j*32
2119 //     IF imm8[7:0] > 31
2120 //       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
2121 //     ELSE
2122 //       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
2123 //     FI
2124 //   ENDFOR
2125 //
2126 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
2127 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
2128 #define _mm_srai_epi32(a, imm)                                             \
2129     __extension__({                                                        \
2130         __m128i ret;                                                       \
2131         if ((imm) == 0) {                                                  \
2132             ret = a;                                                       \
2133         } else if (0 < (imm) && (imm) < 32) {                              \
2134             ret = vreinterpretq_m128i_s32(                                 \
2135                 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-(imm)))); \
2136         } else {                                                           \
2137             ret = vreinterpretq_m128i_s32(                                 \
2138                 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));              \
2139         }                                                                  \
2140         ret;                                                               \
2141     })
2142 
2143 // Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
2144 // imm must be an immediate.
2145 //
2146 //   r := srl(a, imm*8)
2147 //
2148 // https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
2149 // FORCE_INLINE __m128i _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
2150 #define _mm_srli_si128(a, imm)                                              \
2151     __extension__({                                                         \
2152         __m128i ret;                                                        \
2153         if ((imm) <= 0) {                                                   \
2154             ret = a;                                                        \
2155         } else if ((imm) > 15) {                                            \
2156             ret = _mm_setzero_si128();                                      \
2157         } else {                                                            \
2158             ret = vreinterpretq_m128i_s8(                                   \
2159                 vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
2160         }                                                                   \
2161         ret;                                                                \
2162     })
2163 
2164 // Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
2165 // must be an immediate.
2166 //
2167 //   r := a << (imm * 8)
2168 //
2169 // https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
2170 // FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
2171 #define _mm_slli_si128(a, imm)                                          \
2172     __extension__({                                                     \
2173         __m128i ret;                                                    \
2174         if ((imm) <= 0) {                                               \
2175             ret = a;                                                    \
2176         } else if ((imm) > 15) {                                        \
2177             ret = _mm_setzero_si128();                                  \
2178         } else {                                                        \
2179             ret = vreinterpretq_m128i_s8(vextq_s8(                      \
2180                 vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
2181         }                                                               \
2182         ret;                                                            \
2183     })
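// Illustrative usage (a sketch, for an arbitrary __m128i v): both macros shift
// by whole bytes, not bits.
//
//   __m128i lo_dropped = _mm_srli_si128(v, 4); // drop the low 4 bytes,
//                                              // zero-fill the high 4 bytes
//   __m128i shifted_up = _mm_slli_si128(v, 4); // move v up by 4 bytes,
//                                              // zero-fill the low 4 bytes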
2184 
2185 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2186 // shifting in zeros.
2187 //
2188 //   r0 := a0 << count
2189 //   r1 := a1 << count
2190 //   ...
2191 //   r7 := a7 << count
2192 //
2193 // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
2194 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
2195 {
2196     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2197     if (c > 15)
2198         return _mm_setzero_si128();
2199 
2200     int16x8_t vc = vdupq_n_s16((int16_t) c);
2201     return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
2202 }
2203 
2204 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2205 // shifting in zeros.
2206 //
2207 // r0 := a0 << count
2208 // r1 := a1 << count
2209 // r2 := a2 << count
2210 // r3 := a3 << count
2211 //
2212 // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
2213 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
2214 {
2215     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2216     if (c > 31)
2217         return _mm_setzero_si128();
2218 
2219     int32x4_t vc = vdupq_n_s32((int32_t) c);
2220     return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
2221 }
2222 
2223 // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
2224 // shifting in zeros.
2225 //
2226 // r0 := a0 << count
2227 // r1 := a1 << count
2228 //
2229 // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
2230 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
2231 {
2232     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2233     if (c > 63)
2234         return _mm_setzero_si128();
2235 
2236     int64x2_t vc = vdupq_n_s64((int64_t) c);
2237     return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
2238 }
2239 
2240 // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
2241 // while shifting in zeros.
2242 //
2243 // r0 := srl(a0, count)
2244 // r1 := srl(a1, count)
2245 // ...
2246 // r7 := srl(a7, count)
2247 //
2248 // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
2249 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
2250 {
2251     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2252     if (c > 15)
2253         return _mm_setzero_si128();
2254 
2255     int16x8_t vc = vdupq_n_s16(-(int16_t) c);
2256     return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
2257 }
2258 
2259 // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
2260 // while shifting in zeros.
2261 //
2262 // r0 := srl(a0, count)
2263 // r1 := srl(a1, count)
2264 // r2 := srl(a2, count)
2265 // r3 := srl(a3, count)
2266 //
2267 // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
2268 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
2269 {
2270     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2271     if (c > 31)
2272         return _mm_setzero_si128();
2273 
2274     int32x4_t vc = vdupq_n_s32(-(int32_t) c);
2275     return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
2276 }
2277 
2278 // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
2279 // while shifting in zeros.
2280 //
2281 // r0 := srl(a0, count)
2282 // r1 := srl(a1, count)
2283 //
2284 // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
2285 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
2286 {
2287     uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2288     if (c > 63)
2289         return _mm_setzero_si128();
2290 
2291     int64x2_t vc = vdupq_n_s64(-(int64_t) c);
2292     return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
2293 }
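// Illustrative usage (a sketch; assumes _mm_cvtsi32_si128 from elsewhere in
// this header): unlike the _mm_slli_*/_mm_srli_* forms, these variants read
// the shift count from the low 64 bits of a vector.
//
//   __m128i cnt = _mm_cvtsi32_si128(3);  // count = 3, upper bits zero
//   __m128i r   = _mm_sll_epi32(v, cnt); // each 32-bit lane shifted left by 3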
2294 
2295 // NEON does not provide a version of this function.
2296 // Creates a 16-bit mask from the most significant bits of the 16 signed or
2297 // unsigned 8-bit integers in a and zero extends the upper bits.
2298 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
2299 FORCE_INLINE int _mm_movemask_epi8(__m128i a)
2300 {
2301 #if defined(__aarch64__)
2302     uint8x16_t input = vreinterpretq_u8_m128i(a);
2303     const int8_t ALIGN_STRUCT(16)
2304         xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
2305     const uint8x16_t mask_and = vdupq_n_u8(0x80);
2306     const int8x16_t mask_shift = vld1q_s8(xr);
2307     const uint8x16_t mask_result =
2308         vshlq_u8(vandq_u8(input, mask_and), mask_shift);
2309     uint8x8_t lo = vget_low_u8(mask_result);
2310     uint8x8_t hi = vget_high_u8(mask_result);
2311 
2312     return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
2313 #else
2314     // Use increasingly wide shifts+adds to collect the sign bits
2315     // together.
2316     // Since the widening shifts would be rather confusing to follow in little
2317     // endian, everything will be illustrated in big endian order instead. This
2318     // has a different result - the bits would actually be reversed on a big
2319     // endian machine.
2320 
2321     // Starting input (only half the elements are shown):
2322     // 89 ff 1d c0 00 10 99 33
2323     uint8x16_t input = vreinterpretq_u8_m128i(a);
2324 
2325     // Shift out everything but the sign bits with an unsigned shift right.
2326     //
2327     // Bytes of the vector:
2328     // 89 ff 1d c0 00 10 99 33
2329     // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
2330     //  |  |  |  |  |  |  |  |
2331     // 01 01 00 01 00 00 01 00
2332     //
2333     // Bits of first important lane(s):
2334     // 10001001 (89)
2335     // \______
2336     //        |
2337     // 00000001 (01)
2338     uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
2339 
2340     // Merge the even lanes together with a 16-bit unsigned shift right + add.
2341     // 'xx' represents garbage data which will be ignored in the final result.
2342     // In the important bytes, the add functions like a binary OR.
2343     //
2344     // 01 01 00 01 00 00 01 00
2345     //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
2346     //    \|    \|    \|    \|
2347     // xx 03 xx 01 xx 00 xx 02
2348     //
2349     // 00000001 00000001 (01 01)
2350     //        \_______ |
2351     //                \|
2352     // xxxxxxxx xxxxxx11 (xx 03)
2353     uint32x4_t paired16 =
2354         vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2355 
2356     // Repeat with a wider 32-bit shift + add.
2357     // xx 03 xx 01 xx 00 xx 02
2358     //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
2359     //     14))
2360     //          \|          \|
2361     // xx xx xx 0d xx xx xx 02
2362     //
2363     // 00000011 00000001 (03 01)
2364     //        \\_____ ||
2365     //         '----.\||
2366     // xxxxxxxx xxxx1101 (xx 0d)
2367     uint64x2_t paired32 =
2368         vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2369 
2370     // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
2371     // lanes. xx xx xx 0d xx xx xx 02
2372     //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
2373     //            28))
2374     //                      \|
2375     // xx xx xx xx xx xx xx d2
2376     //
2377     // 00001101 00000010 (0d 02)
2378     //     \   \___ |  |
2379     //      '---.  \|  |
2380     // xxxxxxxx 11010010 (xx d2)
2381     uint8x16_t paired64 =
2382         vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2383 
2384     // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
2385     // xx xx xx xx xx xx xx d2
2386     //                      ||  return paired64[0]
2387     //                      d2
2388     // Note: Little endian would return the correct value 4b (01001011) instead.
2389     return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
2390 #endif
2391 }
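// Illustrative usage (a sketch; chunk is an arbitrary __m128i, and
// _mm_set1_epi8/_mm_cmpeq_epi8 are assumed from elsewhere in this header):
// locate a byte of interest in a 16-byte chunk.
//
//   __m128i eq   = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\n'));
//   int     mask = _mm_movemask_epi8(eq); // bit i set <=> byte i matched
//   // if (mask != 0), the first match is at index __builtin_ctz(mask)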
2392 
2393 // Copy the lower 64-bit integer in a to dst.
2394 //
2395 //   dst[63:0] := a[63:0]
2396 //
2397 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
2398 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
2399 {
2400     return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
2401 }
2402 
2403 // Copy the 64-bit integer a to the lower element of dst, and zero the upper
2404 // element.
2405 //
2406 //   dst[63:0] := a[63:0]
2407 //   dst[127:64] := 0
2408 //
2409 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
2410 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
2411 {
2412     return vreinterpretq_m128i_s64(
2413         vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
2414 }
2415 
2416 // NEON does not provide this method
2417 // Creates a 4-bit mask from the most significant bits of the four
2418 // single-precision, floating-point values.
2419 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2420 FORCE_INLINE int _mm_movemask_ps(__m128 a)
2421 {
2422     uint32x4_t input = vreinterpretq_u32_m128(a);
2423 #if defined(__aarch64__)
2424     static const int32x4_t shift = {0, 1, 2, 3};
2425     uint32x4_t tmp = vshrq_n_u32(input, 31);
2426     return vaddvq_u32(vshlq_u32(tmp, shift));
2427 #else
2428     // Uses the exact same method as _mm_movemask_epi8, see that for details.
2429     // Shift out everything but the sign bits with a 32-bit unsigned shift
2430     // right.
2431     uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2432     // Merge the two pairs together with a 64-bit unsigned shift right + add.
2433     uint8x16_t paired =
2434         vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2435     // Extract the result.
2436     return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2437 #endif
2438 }
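// Illustrative example (not part of the original header): on AArch64 the mask
// is built by shifting each sign bit to its lane index and summing. Assuming
// _mm_set_ps is available from earlier in this file:
//
//   __m128 v = _mm_set_ps(4.0f, -3.0f, 2.0f, -1.0f); // lanes 3..0
//   int m = _mm_movemask_ps(v); // signs are set in lanes 0 and 2, so m == 0x5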
2439 
2440 // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
2441 // all 1's, and return 1 if the result is zero, otherwise return 0.
2442 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
2443 FORCE_INLINE int _mm_test_all_ones(__m128i a)
2444 {
2445     return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
2446            ~(uint64_t) 0;
2447 }
2448 
2449 // Compute the bitwise AND of 128 bits (representing integer data) in a and
2450 // mask, and return 1 if the result is zero, otherwise return 0.
2451 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
2452 FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
2453 {
2454     int64x2_t a_and_mask =
2455         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
2456     return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
2457                                                                            : 1;
2458 }
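// Usage sketch (illustrative only): _mm_test_all_zeros reports whether a & mask
// has any bits set, assuming _mm_set1_epi32 is available earlier in this file:
//
//   __m128i v = _mm_set1_epi32(0x00F0);
//   _mm_test_all_zeros(v, _mm_set1_epi32(0x000F)); // 1: no overlapping bits
//   _mm_test_all_zeros(v, _mm_set1_epi32(0x0030)); // 0: some bits overlap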
2459 
2460 /* Math operations */
2461 
2462 // Subtracts the four single-precision, floating-point values of a and b.
2463 //
2464 //   r0 := a0 - b0
2465 //   r1 := a1 - b1
2466 //   r2 := a2 - b2
2467 //   r3 := a3 - b3
2468 //
2469 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2470 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2471 {
2472     return vreinterpretq_m128_f32(
2473         vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2474 }
2475 
2476 // Subtract the lower single-precision (32-bit) floating-point element in b from
2477 // the lower single-precision (32-bit) floating-point element in a, store the
2478 // result in the lower element of dst, and copy the upper 3 packed elements from
2479 // a to the upper elements of dst.
2480 //
2481 //   dst[31:0] := a[31:0] - b[31:0]
2482 //   dst[127:32] := a[127:32]
2483 //
2484 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2485 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2486 {
2487     return _mm_move_ss(a, _mm_sub_ps(a, b));
2488 }
2489 
2490 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
2491 // and store the results in dst.
2492 //    r0 := a0 - b0
2493 //    r1 := a1 - b1
2494 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
2495 {
2496     return vreinterpretq_m128i_s64(
2497         vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2498 }
2499 
2500 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
2501 // unsigned 32-bit integers of a.
2502 //
2503 //   r0 := a0 - b0
2504 //   r1 := a1 - b1
2505 //   r2 := a2 - b2
2506 //   r3 := a3 - b3
2507 //
2508 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
2509 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
2510 {
2511     return vreinterpretq_m128i_s32(
2512         vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2513 }
2514 
2515 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
2516 {
2517     return vreinterpretq_m128i_s16(
2518         vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2519 }
2520 
2521 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
2522 {
2523     return vreinterpretq_m128i_s8(
2524         vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2525 }
2526 
2527 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
2528 //
2529 //   dst[63:0] := a[63:0] - b[63:0]
2530 //
2531 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
2532 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
2533 {
2534     return vreinterpret_m64_s64(
2535         vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2536 }
2537 
2538 // Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
2539 // integers of a and saturates.
2540 // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
2541 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
2542 {
2543     return vreinterpretq_m128i_u16(
2544         vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2545 }
2546 
2547 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
2548 // integers of a and saturates.
2549 //
2550 //   r0 := UnsignedSaturate(a0 - b0)
2551 //   r1 := UnsignedSaturate(a1 - b1)
2552 //   ...
2553 //   r15 := UnsignedSaturate(a15 - b15)
2554 //
2555 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
2556 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
2557 {
2558     return vreinterpretq_m128i_u8(
2559         vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2560 }
2561 
2562 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
2563 // of a and saturates.
2564 //
2565 //   r0 := SignedSaturate(a0 - b0)
2566 //   r1 := SignedSaturate(a1 - b1)
2567 //   ...
2568 //   r15 := SignedSaturate(a15 - b15)
2569 //
2570 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
2571 FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
2572 {
2573     return vreinterpretq_m128i_s8(
2574         vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2575 }
2576 
2577 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
2578 // of a and saturates.
2579 //
2580 //   r0 := SignedSaturate(a0 - b0)
2581 //   r1 := SignedSaturate(a1 - b1)
2582 //   ...
2583 //   r7 := SignedSaturate(a7 - b7)
2584 //
2585 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
2586 FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2587 {
2588     return vreinterpretq_m128i_s16(
2589         vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2590 }
2591 
2592 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2593 {
2594     return vreinterpretq_m128i_u16(
2595         vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2596 }
2597 
2598 // Negate packed 8-bit integers in a when the corresponding signed
2599 // 8-bit integer in b is negative, and store the results in dst.
2600 // Elements in dst are zeroed out when the corresponding element
2601 // in b is zero.
2602 //
2603 //   for i in 0..15
2604 //     if b[i] < 0
2605 //       r[i] := -a[i]
2606 //     else if b[i] == 0
2607 //       r[i] := 0
2608 //     else
2609 //       r[i] := a[i]
2610 //     fi
2611 //   done
2612 FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
2613 {
2614     int8x16_t a = vreinterpretq_s8_m128i(_a);
2615     int8x16_t b = vreinterpretq_s8_m128i(_b);
2616 
2617     // signed shift right: faster than vclt
2618     // (b < 0) ? 0xFF : 0
2619     uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
2620 
2621     // (b == 0) ? 0xFF : 0
2622 #if defined(__aarch64__)
2623     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
2624 #else
2625     int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
2626 #endif
2627 
2628     // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative 'a')
2629     // based on ltMask
2630     int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
2631     // res = masked & (~zeroMask)
2632     int8x16_t res = vbicq_s8(masked, zeroMask);
2633 
2634     return vreinterpretq_m128i_s8(res);
2635 }
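// Worked example (illustrative): with a = {5, 5, 5, ...} and b = {-1, 0, 3, ...},
// the first three result bytes are {-5, 0, 5}: a negative b negates the lane of
// a, a zero b zeroes it, and a positive b passes it through unchanged.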
2636 
2637 // Negate packed 16-bit integers in a when the corresponding signed
2638 // 16-bit integer in b is negative, and store the results in dst.
2639 // Elements in dst are zeroed out when the corresponding element
2640 // in b is zero.
2641 //
2642 //   for i in 0..7
2643 //     if b[i] < 0
2644 //       r[i] := -a[i]
2645 //     else if b[i] == 0
2646 //       r[i] := 0
2647 //     else
2648 //       r[i] := a[i]
2649 //     fi
2650 //   done
2651 FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
2652 {
2653     int16x8_t a = vreinterpretq_s16_m128i(_a);
2654     int16x8_t b = vreinterpretq_s16_m128i(_b);
2655 
2656     // signed shift right: faster than vclt
2657     // (b < 0) ? 0xFFFF : 0
2658     uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
2659     // (b == 0) ? 0xFFFF : 0
2660 #if defined(__aarch64__)
2661     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
2662 #else
2663     int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
2664 #endif
2665 
2666     // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
2667     // 'a') based on ltMask
2668     int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
2669     // res = masked & (~zeroMask)
2670     int16x8_t res = vbicq_s16(masked, zeroMask);
2671     return vreinterpretq_m128i_s16(res);
2672 }
2673 
2674 // Negate packed 32-bit integers in a when the corresponding signed
2675 // 32-bit integer in b is negative, and store the results in dst.
2676 // Elements in dst are zeroed out when the corresponding element
2677 // in b is zero.
2678 //
2679 //   for i in 0..3
2680 //     if b[i] < 0
2681 //       r[i] := -a[i]
2682 //     else if b[i] == 0
2683 //       r[i] := 0
2684 //     else
2685 //       r[i] := a[i]
2686 //     fi
2687 //   done
2688 FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
2689 {
2690     int32x4_t a = vreinterpretq_s32_m128i(_a);
2691     int32x4_t b = vreinterpretq_s32_m128i(_b);
2692 
2693     // signed shift right: faster than vclt
2694     // (b < 0) ? 0xFFFFFFFF : 0
2695     uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
2696 
2697     // (b == 0) ? 0xFFFFFFFF : 0
2698 #if defined(__aarch64__)
2699     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
2700 #else
2701     int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
2702 #endif
2703 
2704     // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
2705     // 'a') based on ltMask
2706     int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
2707     // res = masked & (~zeroMask)
2708     int32x4_t res = vbicq_s32(masked, zeroMask);
2709     return vreinterpretq_m128i_s32(res);
2710 }
2711 
2712 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
2713 // integer in b is negative, and store the results in dst. Elements in dst are
2714 // zeroed out when the corresponding element in b is zero.
2715 //
2716 //   FOR j := 0 to 3
2717 //      i := j*16
2718 //      IF b[i+15:i] < 0
2719 //        dst[i+15:i] := -(a[i+15:i])
2720 //      ELSE IF b[i+15:i] == 0
2721 //        dst[i+15:i] := 0
2722 //      ELSE
2723 //        dst[i+15:i] := a[i+15:i]
2724 //      FI
2725 //   ENDFOR
2726 //
2727 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
2728 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
2729 {
2730     int16x4_t a = vreinterpret_s16_m64(_a);
2731     int16x4_t b = vreinterpret_s16_m64(_b);
2732 
2733     // signed shift right: faster than vclt
2734     // (b < 0) ? 0xFFFF : 0
2735     uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
2736 
2737     // (b == 0) ? 0xFFFF : 0
2738 #if defined(__aarch64__)
2739     int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
2740 #else
2741     int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
2742 #endif
2743 
2744     // bitwise select either a or negative 'a' (vneg_s16(a) returns negative 'a')
2745     // based on ltMask
2746     int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
2747     // res = masked & (~zeroMask)
2748     int16x4_t res = vbic_s16(masked, zeroMask);
2749 
2750     return vreinterpret_m64_s16(res);
2751 }
2752 
2753 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
2754 // integer in b is negative, and store the results in dst. Elements in dst are
2755 // zeroed out when the corresponding element in b is zero.
2756 //
2757 //   FOR j := 0 to 1
2758 //      i := j*32
2759 //      IF b[i+31:i] < 0
2760 //        dst[i+31:i] := -(a[i+31:i])
2761 //      ELSE IF b[i+31:i] == 0
2762 //        dst[i+31:i] := 0
2763 //      ELSE
2764 //        dst[i+31:i] := a[i+31:i]
2765 //      FI
2766 //   ENDFOR
2767 //
2768 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
2769 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
2770 {
2771     int32x2_t a = vreinterpret_s32_m64(_a);
2772     int32x2_t b = vreinterpret_s32_m64(_b);
2773 
2774     // signed shift right: faster than vclt
2775     // (b < 0) ? 0xFFFFFFFF : 0
2776     uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
2777 
2778     // (b == 0) ? 0xFFFFFFFF : 0
2779 #if defined(__aarch64__)
2780     int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
2781 #else
2782     int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
2783 #endif
2784 
2785     // bitwise select either a or negative 'a' (vneg_s32(a) returns negative 'a')
2786     // based on ltMask
2787     int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
2788     // res = masked & (~zeroMask)
2789     int32x2_t res = vbic_s32(masked, zeroMask);
2790 
2791     return vreinterpret_m64_s32(res);
2792 }
2793 
2794 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
2795 // in b is negative, and store the results in dst. Elements in dst are zeroed out
2796 // when the corresponding element in b is zero.
2797 //
2798 //   FOR j := 0 to 7
2799 //      i := j*8
2800 //      IF b[i+7:i] < 0
2801 //        dst[i+7:i] := -(a[i+7:i])
2802 //      ELSE IF b[i+7:i] == 0
2803 //        dst[i+7:i] := 0
2804 //      ELSE
2805 //        dst[i+7:i] := a[i+7:i]
2806 //      FI
2807 //   ENDFOR
2808 //
2809 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
2810 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
2811 {
2812     int8x8_t a = vreinterpret_s8_m64(_a);
2813     int8x8_t b = vreinterpret_s8_m64(_b);
2814 
2815     // signed shift right: faster than vclt
2816     // (b < 0) ? 0xFF : 0
2817     uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
2818 
2819     // (b == 0) ? 0xFF : 0
2820 #if defined(__aarch64__)
2821     int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
2822 #else
2823     int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
2824 #endif
2825 
2826     // bitwise select either a or negative 'a' (vneg_s8(a) returns negative 'a')
2827     // based on ltMask
2828     int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
2829     // res = masked & (~zeroMask)
2830     int8x8_t res = vbic_s8(masked, zeroMask);
2831 
2832     return vreinterpret_m64_s8(res);
2833 }
2834 
2835 // Average packed unsigned 16-bit integers in a and b, and store the results in
2836 // dst.
2837 //
2838 //   FOR j := 0 to 3
2839 //     i := j*16
2840 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2841 //   ENDFOR
2842 //
2843 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
2844 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
2845 {
2846     return vreinterpret_m64_u16(
2847         vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
2848 }
2849 
2850 // Average packed unsigned 8-bit integers in a and b, and store the results in
2851 // dst.
2852 //
2853 //   FOR j := 0 to 7
2854 //     i := j*8
2855 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2856 //   ENDFOR
2857 //
2858 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
2859 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
2860 {
2861     return vreinterpret_m64_u8(
2862         vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2863 }
2864 
2865 // Average packed unsigned 8-bit integers in a and b, and store the results in
2866 // dst.
2867 //
2868 //   FOR j := 0 to 7
2869 //     i := j*8
2870 //     dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2871 //   ENDFOR
2872 //
2873 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2874 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2875 
2876 // Average packed unsigned 16-bit integers in a and b, and store the results in
2877 // dst.
2878 //
2879 //   FOR j := 0 to 3
2880 //     i := j*16
2881 //     dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2882 //   ENDFOR
2883 //
2884 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2885 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2886 
2887 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
2888 // unsigned 8-bit integers in b and rounds.
2889 //
2890 //   r0 := (a0 + b0) / 2
2891 //   r1 := (a1 + b1) / 2
2892 //   ...
2893 //   r15 := (a15 + b15) / 2
2894 //
2895 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
2896 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
2897 {
2898     return vreinterpretq_m128i_u8(
2899         vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2900 }
2901 
2902 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
2903 // unsigned 16-bit integers in b and rounds.
2904 //
2905 //   r0 := (a0 + b0) / 2
2906 //   r1 := (a1 + b1) / 2
2907 //   ...
2908 //   r7 := (a7 + b7) / 2
2909 //
2910 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
2911 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
2912 {
2913     return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
2914                                  vreinterpretq_u16_m128i(b));
2915 }
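// Worked example (illustrative): vrhaddq_* is a rounding halving add, so for a
// lane with a = 5 and b = 6 the result is (5 + 6 + 1) >> 1 = 6, matching the
// rounding behaviour of the x86 PAVGB/PAVGW instructions.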
2916 
2917 // Adds the four single-precision, floating-point values of a and b.
2918 //
2919 //   r0 := a0 + b0
2920 //   r1 := a1 + b1
2921 //   r2 := a2 + b2
2922 //   r3 := a3 + b3
2923 //
2924 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
2925 FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
2926 {
2927     return vreinterpretq_m128_f32(
2928         vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2929 }
2930 
2931 // Add packed double-precision (64-bit) floating-point elements in a and b, and
2932 // store the results in dst.
2933 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2934 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2935 {
2936 #if defined(__aarch64__)
2937     return vreinterpretq_m128d_f64(
2938         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2939 #else
2940     double *da = (double *) &a;
2941     double *db = (double *) &b;
2942     double c[2];
2943     c[0] = da[0] + db[0];
2944     c[1] = da[1] + db[1];
2945     return vld1q_f32((float32_t *) c);
2946 #endif
2947 }
2948 
2949 // Add 64-bit integers a and b, and store the result in dst.
2950 //
2951 //   dst[63:0] := a[63:0] + b[63:0]
2952 //
2953 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
2954 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2955 {
2956     return vreinterpret_m64_s64(
2957         vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2958 }
2959 
2960 // Adds the scalar single-precision floating point values of a and b.
2961 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
2962 FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
2963 {
2964     float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
2965     float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
2966     // the upper values in the result must be the remnants of <a>.
2967     return vreinterpretq_m128_f32(vaddq_f32(a, value));
2968 }
2969 
2970 // Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2971 // unsigned 64-bit integers in b.
2972 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2973 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2974 {
2975     return vreinterpretq_m128i_s64(
2976         vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2977 }
2978 
2979 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2980 // unsigned 32-bit integers in b.
2981 //
2982 //   r0 := a0 + b0
2983 //   r1 := a1 + b1
2984 //   r2 := a2 + b2
2985 //   r3 := a3 + b3
2986 //
2987 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2988 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2989 {
2990     return vreinterpretq_m128i_s32(
2991         vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2992 }
2993 
2994 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2995 // unsigned 16-bit integers in b.
2996 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2997 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2998 {
2999     return vreinterpretq_m128i_s16(
3000         vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3001 }
3002 
3003 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
3004 // unsigned 8-bit integers in b.
3005 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
3006 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
3007 {
3008     return vreinterpretq_m128i_s8(
3009         vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3010 }
3011 
3012 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
3013 // and saturates.
3014 //
3015 //   r0 := SignedSaturate(a0 + b0)
3016 //   r1 := SignedSaturate(a1 + b1)
3017 //   ...
3018 //   r7 := SignedSaturate(a7 + b7)
3019 //
3020 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
3021 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3022 {
3023     return vreinterpretq_m128i_s16(
3024         vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3025 }
3026 
3027 // Add packed signed 8-bit integers in a and b using saturation, and store the
3028 // results in dst.
3029 //
3030 //   FOR j := 0 to 15
3031 //     i := j*8
3032 //     dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3033 //   ENDFOR
3034 //
3035 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
3036 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3037 {
3038     return vreinterpretq_m128i_s8(
3039         vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3040 }
3041 
3042 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
3043 // b and saturates.
3044 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
3045 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3046 {
3047     return vreinterpretq_m128i_u8(
3048         vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3049 }
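// Worked example (illustrative): with a lane holding a = 200 and b = 100 the
// unsaturated sum 300 does not fit in 8 bits, so vqaddq_u8 clamps the result
// to UnsignedSaturate(300) = 255.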
3050 
3051 // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
3052 // unsigned 16-bit integers from b.
3053 //
3054 //   r0 := (a0 * b0)[15:0]
3055 //   r1 := (a1 * b1)[15:0]
3056 //   ...
3057 //   r7 := (a7 * b7)[15:0]
3058 //
3059 // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
3060 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
3061 {
3062     return vreinterpretq_m128i_s16(
3063         vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3064 }
3065 
3066 // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
3067 // unsigned 32-bit integers from b.
3068 // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
3069 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
3070 {
3071     return vreinterpretq_m128i_s32(
3072         vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3073 }
3074 
3075 // Multiply the packed unsigned 16-bit integers in a and b, producing
3076 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3077 // integers in dst.
3078 //
3079 //   FOR j := 0 to 3
3080 //      i := j*16
3081 //      tmp[31:0] := a[i+15:i] * b[i+15:i]
3082 //      dst[i+15:i] := tmp[31:16]
3083 //   ENDFOR
3084 //
3085 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
3086 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
3087 
3088 // Multiplies the four single-precision, floating-point values of a and b.
3089 //
3090 //   r0 := a0 * b0
3091 //   r1 := a1 * b1
3092 //   r2 := a2 * b2
3093 //   r3 := a3 * b3
3094 //
3095 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
3096 FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
3097 {
3098     return vreinterpretq_m128_f32(
3099         vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3100 }
3101 
3102 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
3103 // and store the results in dst.
3104 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
3105 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
3106 {
3107 #if defined(__aarch64__)
3108     return vreinterpretq_m128d_f64(
3109         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3110 #else
3111     double *da = (double *) &a;
3112     double *db = (double *) &b;
3113     double c[2];
3114     c[0] = da[0] * db[0];
3115     c[1] = da[1] * db[1];
3116     return vld1q_f32((float32_t *) c);
3117 #endif
3118 }
3119 
3120 // Multiply the lower single-precision (32-bit) floating-point element in a and
3121 // b, store the result in the lower element of dst, and copy the upper 3 packed
3122 // elements from a to the upper elements of dst.
3123 //
3124 //   dst[31:0] := a[31:0] * b[31:0]
3125 //   dst[127:32] := a[127:32]
3126 //
3127 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
3128 FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
3129 {
3130     return _mm_move_ss(a, _mm_mul_ps(a, b));
3131 }
3132 
3133 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
3134 // a and b, and store the unsigned 64-bit results in dst.
3135 //
3136 //   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
3137 //   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
3138 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
3139 {
3140     // vmull_u32 upcasts instead of masking, so we downcast.
3141     uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
3142     uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
3143     return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
3144 }
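// Worked example (illustrative): only the even 32-bit lanes participate. With
// 32-bit lanes a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3} the result is the
// two 64-bit products {(uint64_t) a0 * b0, (uint64_t) a2 * b2}; vmovn_u64 keeps
// the low 32 bits of each 64-bit lane, which are exactly a0/a2 and b0/b2 on a
// little-endian layout.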
3145 
3146 // Multiply the low unsigned 32-bit integers from a and b, and store the
3147 // unsigned 64-bit result in dst.
3148 //
3149 //   dst[63:0] := a[31:0] * b[31:0]
3150 //
3151 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
3152 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
3153 {
3154     return vreinterpret_m64_u64(vget_low_u64(
3155         vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
3156 }
3157 
3158 // Multiply the low signed 32-bit integers from each packed 64-bit element in
3159 // a and b, and store the signed 64-bit results in dst.
3160 //
3161 //   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
3162 //   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
3163 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
3164 {
3165     // vmull_s32 upcasts instead of masking, so we downcast.
3166     int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
3167     int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
3168     return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
3169 }
3170 
3171 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3172 // integers from b.
3173 //
3174 //   r0 := (a0 * b0) + (a1 * b1)
3175 //   r1 := (a2 * b2) + (a3 * b3)
3176 //   r2 := (a4 * b4) + (a5 * b5)
3177 //   r3 := (a6 * b6) + (a7 * b7)
3178 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
3179 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
3180 {
3181     int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3182                               vget_low_s16(vreinterpretq_s16_m128i(b)));
3183     int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3184                                vget_high_s16(vreinterpretq_s16_m128i(b)));
3185 
3186     int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
3187     int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
3188 
3189     return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
3190 }
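// Worked example (illustrative): with a = {1, 2, 3, 4, ...} and
// b = {10, 20, 30, 40, ...} the first two 32-bit results are
// r0 = 1*10 + 2*20 = 50 and r1 = 3*30 + 4*40 = 250.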
3191 
3192 // Multiply packed signed 16-bit integers in a and b, producing intermediate
3193 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
3194 // the packed 16-bit integers in dst.
3195 //
3196 //   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
3197 //   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
3198 //   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
3199 //   ...
3200 //   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
3201 FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
3202 {
3203     // Has issues due to saturation
3204     // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
3205 
3206     // Multiply
3207     int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3208                                  vget_low_s16(vreinterpretq_s16_m128i(b)));
3209     int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3210                                  vget_high_s16(vreinterpretq_s16_m128i(b)));
3211 
3212     // Rounding narrowing shift right
3213     // narrow = (int16_t)((mul + 16384) >> 15);
3214     int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
3215     int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
3216 
3217     // Join together
3218     return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
3219 }
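// Worked example (illustrative): with a0 = 0x4000 (0.5 in Q15) and b0 = 0x2000
// (0.25 in Q15), the 32-bit product is 0x08000000; the rounding narrowing shift
// computes (0x08000000 + 0x4000) >> 15 = 0x1000, i.e. 0.125 in Q15.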
3220 
3221 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
3222 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
3223 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
3224 // and pack the saturated results in dst.
3225 //
3226 //   FOR j := 0 to 7
3227 //      i := j*16
3228 //      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
3229 //      a[i+7:i]*b[i+7:i] )
3230 //   ENDFOR
3231 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
3232 {
3233 #if defined(__aarch64__)
3234     uint8x16_t a = vreinterpretq_u8_m128i(_a);
3235     int8x16_t b = vreinterpretq_s8_m128i(_b);
3236     int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
3237                              vmovl_s8(vget_low_s8(b)));
3238     int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
3239                              vmovl_s8(vget_high_s8(b)));
3240     return vreinterpretq_m128i_s16(
3241         vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
3242 #else
3243     // This would be much simpler if x86 would choose to zero extend OR sign
3244     // extend, not both. This could probably be optimized better.
3245     uint16x8_t a = vreinterpretq_u16_m128i(_a);
3246     int16x8_t b = vreinterpretq_s16_m128i(_b);
3247 
3248     // Zero extend a
3249     int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
3250     int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
3251 
3252     // Sign extend by shifting left then shifting right.
3253     int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
3254     int16x8_t b_odd = vshrq_n_s16(b, 8);
3255 
3256     // multiply
3257     int16x8_t prod1 = vmulq_s16(a_even, b_even);
3258     int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
3259 
3260     // saturated add
3261     return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
3262 #endif
3263 }
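// Worked example (illustrative): a is treated as unsigned and b as signed, so
// with the first byte pair a = {255, 1, ...} and b = {-1, 2, ...} the first
// 16-bit result is Saturate16(255 * -1 + 1 * 2) = -253.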
3264 
3265 // Computes the fused multiply-add product of 32-bit floating point numbers.
3266 //
3267 // Return Value
3268 // Multiplies A and B, and adds C to the temporary result before returning it.
3269 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
3270 FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
3271 {
3272 #if defined(__aarch64__)
3273     return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
3274                                             vreinterpretq_f32_m128(b),
3275                                             vreinterpretq_f32_m128(a)));
3276 #else
3277     return _mm_add_ps(_mm_mul_ps(a, b), c);
3278 #endif
3279 }
3280 
3281 // Alternatively add and subtract packed single-precision (32-bit)
3282 // floating-point elements in a to/from packed elements in b, and store the
3283 // results in dst.
3284 //
3285 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
3286 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
3287 {
3288     __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
3289     return _mm_fmadd_ps(b, mask, a);
3290 }
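// Worked example (illustrative): the {-1, 1, -1, 1} mask turns the fused
// multiply-add a + b * mask into the SSE3 addsub pattern, e.g. with
// a = {1, 2, 3, 4} and b = {10, 20, 30, 40} the result is
// {1 - 10, 2 + 20, 3 - 30, 4 + 40} = {-9, 22, -27, 44}.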
3291 
3292 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3293 // b, then horizontally sum each consecutive 8 differences to produce two
3294 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3295 // 16 bits of 64-bit elements in dst.
3296 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
3297 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
3298 {
3299     uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
3300     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3301     uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3302     uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3303     return (__m128i) vsetq_lane_u16(r4, r, 4);
3304 }
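// Worked example (illustrative): each 8-byte half is reduced independently. If
// the first eight absolute differences are all 1 and the last eight are all 2,
// dst holds 8 in bits [15:0] and 16 in bits [79:64]; all other bits are zero.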
3305 
3306 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3307 // b, then horizontally sum each consecutive 8 differences to produce four
3308 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3309 // 16 bits of dst.
3310 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
3311 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
3312 {
3313     uint16x4_t t =
3314         vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3315     uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3316     return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
3317 }
3318 
3319 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3320 // b, then horizontally sum each consecutive 8 differences to produce four
3321 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3322 // 16 bits of dst.
3323 //
3324 //   FOR j := 0 to 7
3325 //      i := j*8
3326 //      tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
3327 //   ENDFOR
3328 //   dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
3329 //   tmp[47:40] + tmp[55:48] + tmp[63:56]
//   dst[63:16] := 0
3330 //
3331 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
3332 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
3333 
3334 // Divides the four single-precision, floating-point values of a and b.
3335 //
3336 //   r0 := a0 / b0
3337 //   r1 := a1 / b1
3338 //   r2 := a2 / b2
3339 //   r3 := a3 / b3
3340 //
3341 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
3342 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
3343 {
3344 #if defined(__aarch64__)
3345     return vreinterpretq_m128_f32(
3346         vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3347 #else
3348     float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
3349     float32x4_t recip1 =
3350         vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
3351     return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
3352 #endif
3353 }
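// Note (illustrative): on ARMv7 the quotient is approximated with a reciprocal
// estimate refined by one Newton-Raphson step; vrecpsq_f32(x, b) returns
// 2 - x * b, so recip1 = recip0 * (2 - recip0 * b). The result is close to, but
// not bit-exact with, the vdivq_f32 used on the AArch64 path.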
3354 
3355 // Divides the scalar single-precision floating point value of a by b.
3356 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
3357 FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
3358 {
3359     float32_t value =
3360         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
3361     return vreinterpretq_m128_f32(
3362         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3363 }
3364 
3365 // Compute the approximate reciprocal of packed single-precision (32-bit)
3366 // floating-point elements in a, and store the results in dst. The maximum
3367 // relative error for this approximation is less than 1.5*2^-12.
3368 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
3369 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
3370 {
3371 #if defined(__aarch64__)
3372     return vreinterpretq_m128_f32(
3373         vdivq_f32(vdupq_n_f32(1.0f), vreinterpretq_f32_m128(in)));
3374 #else
3375     float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
3376     recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3377     return vreinterpretq_m128_f32(recip);
3378 #endif
3379 }
3380 
3381 // Compute the approximate reciprocal of the lower single-precision (32-bit)
3382 // floating-point element in a, store the result in the lower element of dst,
3383 // and copy the upper 3 packed elements from a to the upper elements of dst. The
3384 // maximum relative error for this approximation is less than 1.5*2^-12.
3385 //
3386 //   dst[31:0] := (1.0 / a[31:0])
3387 //   dst[127:32] := a[127:32]
3388 //
3389 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
3390 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
3391 {
3392     return _mm_move_ss(a, _mm_rcp_ps(a));
3393 }
3394 
3395 // Computes the approximations of square roots of the four single-precision,
3396 // floating-point values of a. First computes reciprocal square roots and then
3397 // reciprocals of the four values.
3398 //
3399 //   r0 := sqrt(a0)
3400 //   r1 := sqrt(a1)
3401 //   r2 := sqrt(a2)
3402 //   r3 := sqrt(a3)
3403 //
3404 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
3405 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
3406 {
3407 #if defined(__aarch64__)
3408     return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
3409 #else
3410     float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3411     float32x4_t sq = vrecpeq_f32(recipsq);
3412     // ??? use step versions of both sqrt and recip for better accuracy?
3413     return vreinterpretq_m128_f32(sq);
3414 #endif
3415 }
3416 
3417 // Computes the approximation of the square root of the scalar single-precision
3418 // floating point value of in.
3419 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
3420 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
3421 {
3422     float32_t value =
3423         vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
3424     return vreinterpretq_m128_f32(
3425         vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
3426 }
3427 
3428 // Computes the approximations of the reciprocal square roots of the four
3429 // single-precision floating point values of in.
3430 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
3431 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
3432 {
3433     return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
3434 }
3435 
3436 // Compute the approximate reciprocal square root of the lower single-precision
3437 // (32-bit) floating-point element in a, store the result in the lower element
3438 // of dst, and copy the upper 3 packed elements from a to the upper elements of
3439 // dst.
3440 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
3441 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
3442 {
3443     return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
3444 }
3445 
3446 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3447 // values in dst.
3448 //
3449 //   FOR j := 0 to 3
3450 //      i := j*16
3451 //      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3452 //   ENDFOR
3453 //
3454 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3455 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
3456 {
3457     return vreinterpret_m64_s16(
3458         vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3459 }
3460 
3461 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3462 // values in dst.
3463 //
3464 //   FOR j := 0 to 3
3465 //      i := j*16
3466 //      dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3467 //   ENDFOR
3468 //
3469 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3470 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
3471 
3472 // Computes the maximums of the four single-precision, floating-point values of
3473 // a and b.
3474 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
3475 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
3476 {
3477 #if SSE2NEON_PRECISE_MINMAX
3478     float32x4_t _a = vreinterpretq_f32_m128(a);
3479     float32x4_t _b = vreinterpretq_f32_m128(b);
3480     return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
3481 #else
3482     return vreinterpretq_m128_f32(
3483         vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3484 #endif
3485 }
3486 
3487 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3488 // values in dst.
3489 //
3490 //   FOR j := 0 to 7
3491 //      i := j*8
3492 //      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3493 //   ENDFOR
3494 //
3495 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3496 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
3497 {
3498     return vreinterpret_m64_u8(
3499         vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3500 }
3501 
3502 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3503 // values in dst.
3504 //
3505 //   FOR j := 0 to 7
3506 //      i := j*8
3507 //      dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3508 //   ENDFOR
3509 //
3510 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3511 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
3512 
3513 // Compare packed signed 16-bit integers in a and b, and store packed minimum
3514 // values in dst.
3515 //
3516 //   FOR j := 0 to 3
3517 //      i := j*16
3518 //      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3519 //   ENDFOR
3520 //
3521 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3522 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
3523 {
3524     return vreinterpret_m64_s16(
3525         vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3526 }
3527 
3528 // Compare packed signed 16-bit integers in a and b, and store packed minimum
3529 // values in dst.
3530 //
3531 //   FOR j := 0 to 3
3532 //      i := j*16
3533 //      dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3534 //   ENDFOR
3535 //
3536 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3537 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
3538 
3539 // Computes the minima of the four single-precision, floating-point values of a
3540 // and b.
3541 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
3542 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
3543 {
3544 #if SSE2NEON_PRECISE_MINMAX
3545     float32x4_t _a = vreinterpretq_f32_m128(a);
3546     float32x4_t _b = vreinterpretq_f32_m128(b);
3547     return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
3548 #else
3549     return vreinterpretq_m128_f32(
3550         vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3551 #endif
3552 }
3553 
3554 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3555 // values in dst.
3556 //
3557 //   FOR j := 0 to 7
3558 //      i := j*8
3559 //      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3560 //   ENDFOR
3561 //
3562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3563 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
3564 {
3565     return vreinterpret_m64_u8(
3566         vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3567 }
3568 
3569 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3570 // values in dst.
3571 //
3572 //   FOR j := 0 to 7
3573 //      i := j*8
3574 //      dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3575 //   ENDFOR
3576 //
3577 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3578 #define _m_pminub(a, b) _mm_min_pu8(a, b)
3579 
3580 // Computes the maximum of the two lower scalar single-precision floating point
3581 // values of a and b.
3582 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
3583 FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
3584 {
3585     float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
3586     return vreinterpretq_m128_f32(
3587         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3588 }
3589 
3590 // Computes the minimum of the two lower scalar single-precision floating point
3591 // values of a and b.
3592 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
3593 FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
3594 {
3595     float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
3596     return vreinterpretq_m128_f32(
3597         vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3598 }
3599 
3600 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
3601 // 16 unsigned 8-bit integers from b.
3602 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
3603 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
3604 {
3605     return vreinterpretq_m128i_u8(
3606         vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3607 }
3608 
3609 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
3610 // 16 unsigned 8-bit integers from b.
3611 // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
3612 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
3613 {
3614     return vreinterpretq_m128i_u8(
3615         vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3616 }
3617 
3618 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
3619 // signed 16-bit integers from b.
3620 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
3621 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
3622 {
3623     return vreinterpretq_m128i_s16(
3624         vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3625 }
3626 
3627 // Compare packed signed 8-bit integers in a and b, and store packed maximum
3628 // values in dst.
3629 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
3630 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
3631 {
3632     return vreinterpretq_m128i_s8(
3633         vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3634 }
3635 
3636 // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
3637 // values in dst.
3638 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
3639 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
3640 {
3641     return vreinterpretq_m128i_u16(
3642         vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3643 }
3644 
3645 // Compare packed signed 8-bit integers in a and b, and store packed minimum
3646 // values in dst.
3647 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
3648 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
3649 {
3650     return vreinterpretq_m128i_s8(
3651         vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3652 }
3653 
3654 // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
3655 // values in dst.
3656 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
3657 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
3658 {
3659     return vreinterpretq_m128i_u16(
3660         vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3661 }
3662 
3663 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
3664 // signed 16-bit integers from b.
3665 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
3666 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
3667 {
3668     return vreinterpretq_m128i_s16(
3669         vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3670 }
3671 
3672 // epi versions of min/max
3673 // Computes the pairwise maximums of the four signed 32-bit integer values of a
3674 // and b.
3675 //
3676 // A 128-bit parameter that can be defined with the following equations:
3677 //   r0 := (a0 > b0) ? a0 : b0
3678 //   r1 := (a1 > b1) ? a1 : b1
3679 //   r2 := (a2 > b2) ? a2 : b2
3680 //   r3 := (a3 > b3) ? a3 : b3
3681 //
3682 // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
3683 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
3684 {
3685     return vreinterpretq_m128i_s32(
3686         vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3687 }
3688 
3689 // Computes the pairwise minima of the four signed 32-bit integer values of a
3690 // and b.
3691 //
3692 // A 128-bit parameter that can be defined with the following equations:
3693 //   r0 := (a0 < b0) ? a0 : b0
3694 //   r1 := (a1 < b1) ? a1 : b1
3695 //   r2 := (a2 < b2) ? a2 : b2
3696 //   r3 := (a3 < b3) ? a3 : b3
3697 //
3698 // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
3699 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
3700 {
3701     return vreinterpretq_m128i_s32(
3702         vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3703 }
3704 
3705 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
3706 // values in dst.
3707 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
3708 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
3709 {
3710     return vreinterpretq_m128i_u32(
3711         vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3712 }
3713 
3714 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
3715 // values in dst.
3716 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
3717 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
3718 {
3719     return vreinterpretq_m128i_u32(
3720         vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3721 }
3722 
3723 // Multiply the packed unsigned 16-bit integers in a and b, producing
3724 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3725 // integers in dst.
3726 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
3727 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
3728 {
3729     return vreinterpret_m64_u16(vshrn_n_u32(
3730         vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
3731 }
3732 
3733 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3734 // integers from b.
3735 //
3736 //   r0 := (a0 * b0)[31:16]
3737 //   r1 := (a1 * b1)[31:16]
3738 //   ...
3739 //   r7 := (a7 * b7)[31:16]
3740 //
3741 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
3742 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
3743 {
3744     /* FIXME: issue with large values because of result saturation */
3745     // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
3746     // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
3747     // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
3748     int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
3749     int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
3750     int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3751     int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
3752     int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
3753     int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3754     uint16x8x2_t r =
3755         vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
3756     return vreinterpretq_m128i_u16(r.val[1]);
3757 }
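// Note (illustrative): on little-endian NEON the high 16 bits of each 32-bit
// product land in the odd 16-bit lanes, so vuzpq_u16(...).val[1] gathers
// exactly the eight high halves that the x86 PMULHW instruction returns.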
3758 
3759 // Multiply the packed unsigned 16-bit integers in a and b, producing
3760 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3761 // integers in dst.
3762 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
3763 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
3764 {
3765     uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
3766     uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
3767     uint32x4_t ab3210 = vmull_u16(a3210, b3210);
3768 #if defined(__aarch64__)
3769     uint32x4_t ab7654 =
3770         vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
3771     uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
3772                               vreinterpretq_u16_u32(ab7654));
3773     return vreinterpretq_m128i_u16(r);
3774 #else
3775     uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
3776     uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
3777     uint32x4_t ab7654 = vmull_u16(a7654, b7654);
3778     uint16x8x2_t r =
3779         vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
3780     return vreinterpretq_m128i_u16(r.val[1]);
3781 #endif
3782 }
3783 
3784 // Computes pairwise add of each argument as single-precision, floating-point
3785 // values a and b.
3786 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
3787 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
3788 {
3789 #if defined(__aarch64__)
3790     return vreinterpretq_m128_f32(
3791         vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3792 #else
3793     float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
3794     float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
3795     float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
3796     float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
3797     return vreinterpretq_m128_f32(
3798         vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
3799 #endif
3800 }
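
// Usage sketch (illustrative only): the pairwise adds above reproduce the
// HADDPS lane layout, e.g.
//
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes 0..3 = 1 2 3 4 */
//   __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f); /* lanes 0..3 = 5 6 7 8 */
//   __m128 r = _mm_hadd_ps(a, b);                  /* lanes 0..3 = 3 7 11 15 */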
3801 
3802 // Computes pairwise add of each argument as a 16-bit signed or unsigned integer
3803 // values a and b.
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
3805 {
3806     int16x8_t a = vreinterpretq_s16_m128i(_a);
3807     int16x8_t b = vreinterpretq_s16_m128i(_b);
3808 #if defined(__aarch64__)
3809     return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
3810 #else
3811     return vreinterpretq_m128i_s16(
3812         vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
3813                      vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
3814 #endif
3815 }
3816 
// Horizontally subtract adjacent pairs of single-precision (32-bit)
// floating-point elements in a and b, and pack the results in dst.
3819 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
3821 {
3822 #if defined(__aarch64__)
3823     return vreinterpretq_m128_f32(vsubq_f32(
3824         vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
3825         vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
3826 #else
3827     float32x4x2_t c =
3828         vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
3829     return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
3830 #endif
3831 }
3832 
3833 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
3834 // signed 16-bit results in dst.
3835 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
3837 {
3838     return vreinterpret_m64_s16(
3839         vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3840 }
3841 
3842 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
3843 // signed 32-bit results in dst.
3844 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
3846 {
3847     return vreinterpret_m64_s32(
3848         vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
3849 }
3850 
3851 // Computes pairwise difference of each argument as a 16-bit signed or unsigned
3852 // integer values a and b.
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
3854 {
3855     int32x4_t a = vreinterpretq_s32_m128i(_a);
3856     int32x4_t b = vreinterpretq_s32_m128i(_b);
3857     // Interleave using vshrn/vmovn
3858     // [a0|a2|a4|a6|b0|b2|b4|b6]
3859     // [a1|a3|a5|a7|b1|b3|b5|b7]
3860     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3861     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3862     // Subtract
3863     return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
3864 }
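
// Scalar sketch of the vmovn/vshrn trick used above (illustrative only):
// viewing each 32-bit lane as a pair of adjacent 16-bit elements, vmovn_s32
// extracts the even-indexed (low-half) element and vshrn_n_s32(., 16) the
// odd-indexed (high-half) element; for one 32-bit lane value "lane":
//
//   int16_t even = (int16_t) lane;         /* element 2k, via vmovn      */
//   int16_t odd  = (int16_t) (lane >> 16); /* element 2k+1, via vshrn    */
//   int16_t out  = even - odd;             /* what _mm_hsub_epi16 stores */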
3865 
// Computes saturated pairwise add of each argument as a 16-bit signed
// integer values a and b.
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
3869 {
3870 #if defined(__aarch64__)
3871     int16x8_t a = vreinterpretq_s16_m128i(_a);
3872     int16x8_t b = vreinterpretq_s16_m128i(_b);
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3875 #else
3876     int32x4_t a = vreinterpretq_s32_m128i(_a);
3877     int32x4_t b = vreinterpretq_s32_m128i(_b);
3878     // Interleave using vshrn/vmovn
3879     // [a0|a2|a4|a6|b0|b2|b4|b6]
3880     // [a1|a3|a5|a7|b1|b3|b5|b7]
3881     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3882     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3883     // Saturated add
3884     return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
3885 #endif
3886 }
3887 
3888 // Computes saturated pairwise difference of each argument as a 16-bit signed
3889 // integer values a and b.
3890 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
3892 {
3893 #if defined(__aarch64__)
3894     int16x8_t a = vreinterpretq_s16_m128i(_a);
3895     int16x8_t b = vreinterpretq_s16_m128i(_b);
    return vreinterpretq_m128i_s16(
        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3898 #else
3899     int32x4_t a = vreinterpretq_s32_m128i(_a);
3900     int32x4_t b = vreinterpretq_s32_m128i(_b);
3901     // Interleave using vshrn/vmovn
3902     // [a0|a2|a4|a6|b0|b2|b4|b6]
3903     // [a1|a3|a5|a7|b1|b3|b5|b7]
3904     int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3905     int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3906     // Saturated subtract
3907     return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
3908 #endif
3909 }
3910 
3911 // Computes pairwise add of each argument as a 32-bit signed or unsigned integer
3912 // values a and b.
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
3914 {
3915     int32x4_t a = vreinterpretq_s32_m128i(_a);
3916     int32x4_t b = vreinterpretq_s32_m128i(_b);
3917     return vreinterpretq_m128i_s32(
3918         vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
3919                      vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
3920 }
3921 
3922 // Computes pairwise difference of each argument as a 32-bit signed or unsigned
3923 // integer values a and b.
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
3925 {
3926     int64x2_t a = vreinterpretq_s64_m128i(_a);
3927     int64x2_t b = vreinterpretq_s64_m128i(_b);
3928     // Interleave using vshrn/vmovn
3929     // [a0|a2|b0|b2]
    // [a1|a3|b1|b3]
3931     int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
3932     int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
3933     // Subtract
3934     return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
3935 }
3936 
3937 // Kahan summation for accurate summation of floating-point numbers.
3938 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
3940 {
3941     y -= *c;
3942     float t = *sum + y;
3943     *c = (t - *sum) - y;
3944     *sum = t;
3945 }
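
// Worked example (illustrative only, calling the helper directly): adding
// 1e-8f to 1.0f with plain "sum += x" never changes the sum, because each
// addend is smaller than half an ulp of 1.0f and is rounded away.  The
// compensation term *c carries those rounded-off parts into later additions:
//
//   float sum = 1.0f, c = 0.0f;
//   for (int i = 0; i < 1000; i++)
//       sse2neon_kadd_f32(&sum, &c, 1e-8f);
//   /* sum ends up near 1.00001f, whereas naive summation stays at 1.0f */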
3946 
3947 // Conditionally multiply the packed single-precision (32-bit) floating-point
3948 // elements in a and b using the high 4 bits in imm8, sum the four products,
3949 // and conditionally store the sum in dst using the low 4 bits of imm.
3950 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
3952 {
3953 #if defined(__aarch64__)
3954     /* shortcuts */
3955     if (imm == 0xFF) {
3956         return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
3957     }
3958     if (imm == 0x7F) {
3959         float32x4_t m = _mm_mul_ps(a, b);
3960         m[3] = 0;
3961         return _mm_set1_ps(vaddvq_f32(m));
3962     }
3963 #endif
3964 
3965     float s = 0, c = 0;
3966     float32x4_t f32a = vreinterpretq_f32_m128(a);
3967     float32x4_t f32b = vreinterpretq_f32_m128(b);
3968 
3969     /* To improve the accuracy of floating-point summation, Kahan algorithm
3970      * is used for each operation.
3971      */
3972     if (imm & (1 << 4))
3973         sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
3974     if (imm & (1 << 5))
3975         sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
3976     if (imm & (1 << 6))
3977         sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
3978     if (imm & (1 << 7))
3979         sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
3980     s += c;
3981 
3982     float32x4_t res = {
3983         (imm & 0x1) ? s : 0,
3984         (imm & 0x2) ? s : 0,
3985         (imm & 0x4) ? s : 0,
3986         (imm & 0x8) ? s : 0,
3987     };
3988     return vreinterpretq_m128_f32(res);
3989 }
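
// Usage sketch (illustrative only): the high four bits of imm select which
// products are summed and the low four bits select which lanes receive the
// sum, as with the x86 DPPS encoding:
//
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
//   __m128 b = _mm_set1_ps(1.0f);
//   __m128 d = _mm_dp_ps(a, b, 0x71); /* sum lanes 0-2: 1+2+3 = 6         */
//                                     /* write lane 0 only: d = {6,0,0,0} */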
3990 
3991 /* Compare operations */
3992 
3993 // Compares for less than
3994 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
3996 {
3997     return vreinterpretq_m128_u32(
3998         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3999 }
4000 
4001 // Compares for less than
4002 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
4004 {
4005     return _mm_move_ss(a, _mm_cmplt_ps(a, b));
4006 }
4007 
4008 // Compares for greater than.
4009 //
4010 //   r0 := (a0 > b0) ? 0xffffffff : 0x0
4011 //   r1 := (a1 > b1) ? 0xffffffff : 0x0
4012 //   r2 := (a2 > b2) ? 0xffffffff : 0x0
4013 //   r3 := (a3 > b3) ? 0xffffffff : 0x0
4014 //
4015 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
4017 {
4018     return vreinterpretq_m128_u32(
4019         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4020 }
4021 
4022 // Compares for greater than.
4023 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
4025 {
4026     return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
4027 }
4028 
4029 // Compares for greater than or equal.
4030 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
4032 {
4033     return vreinterpretq_m128_u32(
4034         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4035 }
4036 
4037 // Compares for greater than or equal.
4038 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
4040 {
4041     return _mm_move_ss(a, _mm_cmpge_ps(a, b));
4042 }
4043 
4044 // Compares for less than or equal.
4045 //
4046 //   r0 := (a0 <= b0) ? 0xffffffff : 0x0
4047 //   r1 := (a1 <= b1) ? 0xffffffff : 0x0
4048 //   r2 := (a2 <= b2) ? 0xffffffff : 0x0
4049 //   r3 := (a3 <= b3) ? 0xffffffff : 0x0
4050 //
4051 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
4053 {
4054     return vreinterpretq_m128_u32(
4055         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4056 }
4057 
4058 // Compares for less than or equal.
4059 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
4061 {
4062     return _mm_move_ss(a, _mm_cmple_ps(a, b));
4063 }
4064 
4065 // Compares for equality.
4066 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
4068 {
4069     return vreinterpretq_m128_u32(
4070         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4071 }
4072 
4073 // Compares for equality.
4074 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
4076 {
4077     return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
4078 }
4079 
4080 // Compares for inequality.
4081 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
4083 {
4084     return vreinterpretq_m128_u32(vmvnq_u32(
4085         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
4086 }
4087 
4088 // Compares for inequality.
4089 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
4091 {
4092     return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
4093 }
4094 
4095 // Compares for not greater than or equal.
4096 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
4098 {
4099     return _mm_cmplt_ps(a, b);
4100 }
4101 
4102 // Compares for not greater than or equal.
4103 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
4105 {
4106     return _mm_cmplt_ss(a, b);
4107 }
4108 
4109 // Compares for not greater than.
4110 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
4112 {
4113     return _mm_cmple_ps(a, b);
4114 }
4115 
4116 // Compares for not greater than.
4117 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
4119 {
4120     return _mm_cmple_ss(a, b);
4121 }
4122 
4123 // Compares for not less than or equal.
4124 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
4126 {
4127     return _mm_cmpgt_ps(a, b);
4128 }
4129 
4130 // Compares for not less than or equal.
4131 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
4133 {
4134     return _mm_cmpgt_ss(a, b);
4135 }
4136 
4137 // Compares for not less than.
4138 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
4140 {
4141     return _mm_cmpge_ps(a, b);
4142 }
4143 
4144 // Compares for not less than.
4145 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
4147 {
4148     return _mm_cmpge_ss(a, b);
4149 }
4150 
4151 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
4152 // unsigned 8-bit integers in b for equality.
4153 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
4155 {
4156     return vreinterpretq_m128i_u8(
4157         vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4158 }
4159 
4160 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
4161 // unsigned 16-bit integers in b for equality.
4162 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
4164 {
4165     return vreinterpretq_m128i_u16(
4166         vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4167 }
4168 
4169 // Compare packed 32-bit integers in a and b for equality, and store the results
4170 // in dst
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
4172 {
4173     return vreinterpretq_m128i_u32(
4174         vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4175 }
4176 
4177 // Compare packed 64-bit integers in a and b for equality, and store the results
4178 // in dst
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
4180 {
4181 #if defined(__aarch64__)
4182     return vreinterpretq_m128i_u64(
4183         vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
4184 #else
4185     // ARMv7 lacks vceqq_u64
4186     // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4187     uint32x4_t cmp =
4188         vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
4189     uint32x4_t swapped = vrev64q_u32(cmp);
4190     return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
4191 #endif
4192 }
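
// Scalar sketch of the ARMv7 fallback above (illustrative only): a 64-bit
// lane is equal exactly when both of its 32-bit halves are equal, so the
// 32-bit equality mask is ANDed with a copy of itself whose halves are
// swapped within each 64-bit lane (vrev64q_u32):
//
//   uint32_t eq_lo = (a_lo == b_lo) ? 0xFFFFFFFFu : 0;
//   uint32_t eq_hi = (a_hi == b_hi) ? 0xFFFFFFFFu : 0;
//   /* both halves of the 64-bit result end up as eq_lo & eq_hi */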
4193 
4194 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
// in b for less than.
4196 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
4198 {
4199     return vreinterpretq_m128i_u8(
4200         vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4201 }
4202 
4203 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4204 // in b for greater than.
4205 //
4206 //   r0 := (a0 > b0) ? 0xff : 0x0
4207 //   r1 := (a1 > b1) ? 0xff : 0x0
4208 //   ...
4209 //   r15 := (a15 > b15) ? 0xff : 0x0
4210 //
4211 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
4213 {
4214     return vreinterpretq_m128i_u8(
4215         vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4216 }
4217 
4218 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4219 // in b for less than.
4220 //
4221 //   r0 := (a0 < b0) ? 0xffff : 0x0
4222 //   r1 := (a1 < b1) ? 0xffff : 0x0
4223 //   ...
4224 //   r7 := (a7 < b7) ? 0xffff : 0x0
4225 //
4226 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
4228 {
4229     return vreinterpretq_m128i_u16(
4230         vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4231 }
4232 
4233 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4234 // in b for greater than.
4235 //
4236 //   r0 := (a0 > b0) ? 0xffff : 0x0
4237 //   r1 := (a1 > b1) ? 0xffff : 0x0
4238 //   ...
4239 //   r7 := (a7 > b7) ? 0xffff : 0x0
4240 //
4241 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
4243 {
4244     return vreinterpretq_m128i_u16(
4245         vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4246 }
4247 
4248 
4249 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4250 // in b for less than.
4251 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
4253 {
4254     return vreinterpretq_m128i_u32(
4255         vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4256 }
4257 
4258 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4259 // in b for greater than.
4260 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
4262 {
4263     return vreinterpretq_m128i_u32(
4264         vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4265 }
4266 
4267 // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
4268 // in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
4270 {
4271 #if defined(__aarch64__)
4272     return vreinterpretq_m128i_u64(
4273         vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
4274 #else
4275     // ARMv7 lacks vcgtq_s64.
4276     // This is based off of Clang's SSE2 polyfill:
4277     // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
4278 
4279     // Mask the sign bit out since we need a signed AND an unsigned comparison
4280     // and it is ugly to try and split them.
4281     int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
4282     int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
4283     int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
4284     // Check if a > b
4285     int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
4286     // Copy upper mask to lower mask
4287     // a_hi > b_hi
4288     int64x2_t gt_hi = vshrq_n_s64(greater, 63);
4289     // Copy lower mask to upper mask
4290     // a_lo > b_lo
4291     int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
4292     // Compare for equality
4293     int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
4294     // Copy upper mask to lower mask
4295     // a_hi == b_hi
4296     int64x2_t eq_hi = vshrq_n_s64(equal, 63);
4297     // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
4298     int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
4299     return vreinterpretq_m128i_s64(ret);
4300 #endif
4301 }
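
// Scalar form of the decomposition above (illustrative only): XORing the
// lower 32-bit words with 0x80000000 makes the signed vcgtq_s32 act as an
// unsigned compare on them, after which each 64-bit lane computes
//
//   gt64 = (a_hi > b_hi) || ((a_hi == b_hi) && (a_lo_unsigned > b_lo_unsigned))
//
// where a_hi/b_hi are the signed upper words of each 64-bit lane.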
4302 
4303 // Compares the four 32-bit floats in a and b to check if any values are NaN.
4304 // Ordered compare between each value returns true for "orderable" and false for
4305 // "not orderable" (NaN).
4306 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
4307 // also:
4308 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
4309 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
4311 {
4312     // Note: NEON does not have ordered compare builtin
4313     // Need to compare a eq a and b eq b to check for NaN
4314     // Do AND of results to get final
4315     uint32x4_t ceqaa =
4316         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4317     uint32x4_t ceqbb =
4318         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4319     return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
4320 }
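
// Illustrative example (not part of the original header): only lanes where
// both inputs are non-NaN report "ordered" (all-ones):
//
//   __m128 a = _mm_set_ps(1.0f, 2.0f, NAN, 4.0f);
//   __m128 b = _mm_set_ps(1.0f, NAN, 3.0f, 4.0f);
//   __m128 m = _mm_cmpord_ps(a, b); /* lanes 0,3 all-ones; lanes 1,2 zero */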
4321 
4322 // Compares for ordered.
4323 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
4325 {
4326     return _mm_move_ss(a, _mm_cmpord_ps(a, b));
4327 }
4328 
4329 // Compares for unordered.
4330 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
4332 {
4333     uint32x4_t f32a =
4334         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4335     uint32x4_t f32b =
4336         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4337     return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
4338 }
4339 
4340 // Compares for unordered.
4341 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
4343 {
4344     return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
4345 }
4346 
4347 // Compares the lower single-precision floating point scalar values of a and b
4348 // using a less than operation. :
4349 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
4350 // note!! The documentation on MSDN is incorrect!  If either of the values is a
4351 // NAN the docs say you will get a one, but in fact, it will return a zero!!
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
4353 {
4354     uint32x4_t a_not_nan =
4355         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4356     uint32x4_t b_not_nan =
4357         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4358     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4359     uint32x4_t a_lt_b =
4360         vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4361     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
4362 }
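
// Illustrative example (not part of the original header) of the NaN rule
// described in the note above:
//
//   int r1 = _mm_comilt_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f)); /* 1 */
//   int r2 = _mm_comilt_ss(_mm_set_ss(NAN), _mm_set_ss(2.0f));  /* 0, not 1 */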
4363 
4364 // Compares the lower single-precision floating point scalar values of a and b
4365 // using a greater than operation. :
4366 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
4368 {
4369     // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
4370     // vreinterpretq_f32_m128(b)), 0);
4371     uint32x4_t a_not_nan =
4372         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4373     uint32x4_t b_not_nan =
4374         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4375     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4376     uint32x4_t a_gt_b =
4377         vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4378     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
4379 }
4380 
4381 // Compares the lower single-precision floating point scalar values of a and b
4382 // using a less than or equal operation. :
4383 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
4385 {
4386     // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
4387     // vreinterpretq_f32_m128(b)), 0);
4388     uint32x4_t a_not_nan =
4389         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4390     uint32x4_t b_not_nan =
4391         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4392     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4393     uint32x4_t a_le_b =
4394         vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4395     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
4396 }
4397 
4398 // Compares the lower single-precision floating point scalar values of a and b
4399 // using a greater than or equal operation. :
4400 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
4402 {
4403     // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
4404     // vreinterpretq_f32_m128(b)), 0);
4405     uint32x4_t a_not_nan =
4406         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4407     uint32x4_t b_not_nan =
4408         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4409     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4410     uint32x4_t a_ge_b =
4411         vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4412     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
4413 }
4414 
4415 // Compares the lower single-precision floating point scalar values of a and b
4416 // using an equality operation. :
4417 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
4419 {
4420     // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4421     // vreinterpretq_f32_m128(b)), 0);
4422     uint32x4_t a_not_nan =
4423         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4424     uint32x4_t b_not_nan =
4425         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4426     uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4427     uint32x4_t a_eq_b =
4428         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4429     return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
4430 }
4431 
4432 // Compares the lower single-precision floating point scalar values of a and b
4433 // using an inequality operation. :
4434 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
4436 {
4437     // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4438     // vreinterpretq_f32_m128(b)), 0);
4439     uint32x4_t a_not_nan =
4440         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4441     uint32x4_t b_not_nan =
4442         vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4443     uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4444     uint32x4_t a_neq_b = vmvnq_u32(
4445         vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4446     return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
4447 }
4448 
4449 // according to the documentation, these intrinsics behave the same as the
4450 // non-'u' versions.  We'll just alias them here.
4451 #define _mm_ucomilt_ss _mm_comilt_ss
4452 #define _mm_ucomile_ss _mm_comile_ss
4453 #define _mm_ucomigt_ss _mm_comigt_ss
4454 #define _mm_ucomige_ss _mm_comige_ss
4455 #define _mm_ucomieq_ss _mm_comieq_ss
4456 #define _mm_ucomineq_ss _mm_comineq_ss
4457 
4458 /* Conversions */
4459 
4460 // Convert packed signed 32-bit integers in b to packed single-precision
4461 // (32-bit) floating-point elements, store the results in the lower 2 elements
4462 // of dst, and copy the upper 2 packed elements from a to the upper elements of
4463 // dst.
4464 //
4465 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4466 //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4467 //   dst[95:64] := a[95:64]
4468 //   dst[127:96] := a[127:96]
4469 //
4470 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
4472 {
4473     return vreinterpretq_m128_f32(
4474         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4475                      vget_high_f32(vreinterpretq_f32_m128(a))));
4476 }
4477 
4478 // Convert the signed 32-bit integer b to a single-precision (32-bit)
4479 // floating-point element, store the result in the lower element of dst, and
4480 // copy the upper 3 packed elements from a to the upper elements of dst.
4481 //
4482 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4483 //   dst[127:32] := a[127:32]
4484 //
4485 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
4487 {
4488     return vreinterpretq_m128_f32(
4489         vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
4490 }
4491 
4492 // Convert the signed 32-bit integer b to a single-precision (32-bit)
4493 // floating-point element, store the result in the lower element of dst, and
4494 // copy the upper 3 packed elements from a to the upper elements of dst.
4495 //
4496 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4497 //   dst[127:32] := a[127:32]
4498 //
4499 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
4500 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
4501 
4502 // Convert the signed 64-bit integer b to a single-precision (32-bit)
4503 // floating-point element, store the result in the lower element of dst, and
4504 // copy the upper 3 packed elements from a to the upper elements of dst.
4505 //
4506 //   dst[31:0] := Convert_Int64_To_FP32(b[63:0])
4507 //   dst[127:32] := a[127:32]
4508 //
4509 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
4511 {
4512     return vreinterpretq_m128_f32(
4513         vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
4514 }
4515 
4516 // Convert the lower single-precision (32-bit) floating-point element in a to a
4517 // 32-bit integer, and store the result in dst.
4518 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
4520 {
4521 #if defined(__aarch64__)
4522     return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
4523 #else
4524     float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4525     float32_t diff = data - floor(data);
4526     if (diff > 0.5)
4527         return (int32_t) ceil(data);
4528     if (diff == 0.5) {
4529         int32_t f = (int32_t) floor(data);
4530         int32_t c = (int32_t) ceil(data);
4531         return c & 1 ? f : c;
4532     }
4533     return (int32_t) floor(data);
4534 #endif
4535 }
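
// Illustrative example (not part of the original header): both branches
// implement round-to-nearest-even, so ties go to the even integer:
//
//   int r1 = _mm_cvt_ss2si(_mm_set_ss(2.5f)); /* 2, not 3 */
//   int r2 = _mm_cvt_ss2si(_mm_set_ss(3.5f)); /* 4        */
//   int r3 = _mm_cvt_ss2si(_mm_set_ss(2.4f)); /* 2        */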
4536 
4537 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
4538 // floating-point elements, and store the results in dst.
4539 //
4540 //   FOR j := 0 to 3
4541 //      i := j*16
4542 //      m := j*32
4543 //      dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
4544 //   ENDFOR
4545 //
4546 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
4548 {
4549     return vreinterpretq_m128_f32(
4550         vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
4551 }
4552 
4553 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
4554 // floating-point elements, store the results in the lower 2 elements of dst,
4555 // and copy the upper 2 packed elements from a to the upper elements of dst.
4556 //
4557 //   dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4558 //   dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4559 //   dst[95:64] := a[95:64]
4560 //   dst[127:96] := a[127:96]
4561 //
4562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
4564 {
4565     return vreinterpretq_m128_f32(
4566         vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4567                      vget_high_f32(vreinterpretq_f32_m128(a))));
4568 }
4569 
4570 // Convert packed signed 32-bit integers in a to packed single-precision
4571 // (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, then convert the packed signed 32-bit integers in b to
4573 // single-precision (32-bit) floating-point element, and store the results in
4574 // the upper 2 elements of dst.
4575 //
4576 //   dst[31:0] := Convert_Int32_To_FP32(a[31:0])
4577 //   dst[63:32] := Convert_Int32_To_FP32(a[63:32])
4578 //   dst[95:64] := Convert_Int32_To_FP32(b[31:0])
4579 //   dst[127:96] := Convert_Int32_To_FP32(b[63:32])
4580 //
4581 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
4583 {
4584     return vreinterpretq_m128_f32(vcvtq_f32_s32(
4585         vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
4586 }
4587 
4588 // Convert the lower packed 8-bit integers in a to packed single-precision
4589 // (32-bit) floating-point elements, and store the results in dst.
4590 //
4591 //   FOR j := 0 to 3
4592 //      i := j*8
4593 //      m := j*32
4594 //      dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
4595 //   ENDFOR
4596 //
4597 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
4599 {
4600     return vreinterpretq_m128_f32(vcvtq_f32_s32(
4601         vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
4602 }
4603 
4604 // Convert packed unsigned 16-bit integers in a to packed single-precision
4605 // (32-bit) floating-point elements, and store the results in dst.
4606 //
4607 //   FOR j := 0 to 3
4608 //      i := j*16
4609 //      m := j*32
4610 //      dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
4611 //   ENDFOR
4612 //
4613 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
4615 {
4616     return vreinterpretq_m128_f32(
4617         vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
4618 }
4619 
4620 // Convert the lower packed unsigned 8-bit integers in a to packed
4621 // single-precision (32-bit) floating-point elements, and store the results in
4622 // dst.
4623 //
4624 //   FOR j := 0 to 3
4625 //      i := j*8
4626 //      m := j*32
4627 //      dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
4628 //   ENDFOR
4629 //
4630 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
4632 {
4633     return vreinterpretq_m128_f32(vcvtq_f32_u32(
4634         vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
4635 }
4636 
4637 // Converts the four single-precision, floating-point values of a to signed
4638 // 32-bit integer values using truncate.
4639 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4641 {
4642     return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4643 }
4644 
4645 // Convert the lower double-precision (64-bit) floating-point element in a to a
4646 // 64-bit integer with truncation, and store the result in dst.
4647 //
4648 //   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4649 //
4650 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4652 {
4653 #if defined(__aarch64__)
4654     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4655 #else
4656     double ret = *((double *) &a);
4657     return (int64_t) ret;
4658 #endif
4659 }
4660 
4661 // Convert the lower double-precision (64-bit) floating-point element in a to a
4662 // 64-bit integer with truncation, and store the result in dst.
4663 //
4664 //   dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4665 //
4666 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
4667 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4668 
4669 // Converts the four signed 32-bit integer values of a to single-precision,
4670 // floating-point values
4671 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
4673 {
4674     return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
4675 }
4676 
// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
// unsigned 16-bit integers.
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
4680 {
4681     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
4682     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4683     return vreinterpretq_m128i_u16(u16x8);
4684 }
4685 
4686 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
4687 // unsigned 32-bit integers.
4688 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
4690 {
4691     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
4692     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
4693     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
4694     return vreinterpretq_m128i_u32(u32x4);
4695 }
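
// Illustrative example (not part of the original header): only the four
// lowest bytes take part in the widening, e.g.
//
//   __m128i v = _mm_cvtsi32_si128(0x04030201);
//   __m128i w = _mm_cvtepu8_epi32(v); /* 32-bit lanes 0..3 = 1, 2, 3, 4 */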
4696 
4697 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
4698 // unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
4700 {
4701     uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
4702     uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
4703     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4704     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4705     return vreinterpretq_m128i_u64(u64x2);
4706 }
4707 
// Converts the eight signed 8-bit integers in the lower 64 bits to eight
// signed 16-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
4711 {
4712     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
4713     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4714     return vreinterpretq_m128i_s16(s16x8);
4715 }
4716 
// Converts the four signed 8-bit integers in the lower 32 bits to four
// signed 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
4720 {
4721     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
4722     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
4723     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
4724     return vreinterpretq_m128i_s32(s32x4);
4725 }
4726 
// Converts the two signed 8-bit integers in the lower 16 bits to two
// signed 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
4730 {
4731     int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
4732     int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
4733     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4734     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4735     return vreinterpretq_m128i_s64(s64x2);
4736 }
4737 
4738 // Converts the four signed 16-bit integers in the lower 64 bits to four signed
4739 // 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
4741 {
4742     return vreinterpretq_m128i_s32(
4743         vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
4744 }
4745 
// Converts the two signed 16-bit integers in the lower 32 bits to two signed
// 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
4749 {
4750     int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
4751     int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4752     int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4753     return vreinterpretq_m128i_s64(s64x2);
4754 }
4755 
4756 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
4757 // unsigned 32-bit integers.
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
4759 {
4760     return vreinterpretq_m128i_u32(
4761         vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
4762 }
4763 
4764 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
4765 // unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
4767 {
4768     uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
4769     uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4770     uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4771     return vreinterpretq_m128i_u64(u64x2);
4772 }
4773 
4774 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
4775 // unsigned 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
4777 {
4778     return vreinterpretq_m128i_u64(
4779         vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
4780 }
4781 
4782 // Converts the two signed 32-bit integers in the lower 64 bits to two signed
4783 // 64-bit integers.
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
4785 {
4786     return vreinterpretq_m128i_s64(
4787         vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
4788 }
4789 
4790 // Converts the four single-precision, floating-point values of a to signed
4791 // 32-bit integer values.
4792 //
4793 //   r0 := (int) a0
4794 //   r1 := (int) a1
4795 //   r2 := (int) a2
4796 //   r3 := (int) a3
4797 //
4798 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4799 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4800 // does not support! It is supported on ARMv8-A however.
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4802 {
4803 #if defined(__aarch64__)
4804     return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4805 #else
4806     uint32x4_t signmask = vdupq_n_u32(0x80000000);
4807     float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4808                                  vdupq_n_f32(0.5f)); /* +/- 0.5 */
4809     int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4810         vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4811     int32x4_t r_trunc =
4812         vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4813     int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4814         vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4815     int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4816                                  vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4817     float32x4_t delta = vsubq_f32(
4818         vreinterpretq_f32_m128(a),
4819         vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4820     uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
4821     return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
4822 #endif
4823 }
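
// Illustrative example (not part of the original header): both paths round
// halfway cases to the even integer, matching the default SSE (MXCSR)
// rounding mode:
//
//   __m128i r = _mm_cvtps_epi32(_mm_set_ps(2.5f, 1.5f, 0.5f, -0.5f));
//   /* 32-bit lanes 0..3 = 0, 0, 2, 2   (i.e. -0.5 -> 0, 0.5 -> 0,
//      1.5 -> 2, 2.5 -> 2) */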
4824 
4825 // Copy the lower 32-bit integer in a to dst.
4826 //
4827 //   dst[31:0] := a[31:0]
4828 //
4829 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4831 {
4832     return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4833 }
4834 
4835 // Copy the lower 64-bit integer in a to dst.
4836 //
4837 //   dst[63:0] := a[63:0]
4838 //
4839 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4841 {
4842     return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4843 }
4844 
4845 // Copy the lower 64-bit integer in a to dst.
4846 //
4847 //   dst[63:0] := a[63:0]
4848 //
4849 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4850 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4851 
4852 // Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4853 // zero extending the upper bits.
4854 //
4855 //   r0 := a
4856 //   r1 := 0x0
4857 //   r2 := 0x0
4858 //   r3 := 0x0
4859 //
4860 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4862 {
4863     return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4864 }
4865 
4866 // Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4867 // zero extending the upper bits.
4868 //
4869 //   r0 := a
4870 //   r1 := 0x0
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4872 {
4873     return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4874 }
4875 
4876 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
4877 // compilation and does not generate any instructions, thus it has zero latency.
4878 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
4880 {
4881     return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
4882 }
4883 
4884 // Applies a type cast to reinterpret four 32-bit floating point values passed
4885 // in as a 128-bit parameter as packed 32-bit integers.
4886 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
4888 {
4889     return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
4890 }
4891 
4892 // Applies a type cast to reinterpret four 32-bit integers passed in as a
4893 // 128-bit parameter as packed 32-bit floating point values.
4894 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
4896 {
4897     return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
4898 }
4899 
4900 // Loads 128-bit value. :
4901 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4903 {
4904     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4905 }
4906 
4907 // Load a double-precision (64-bit) floating-point element from memory into both
4908 // elements of dst.
4909 //
4910 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4911 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4912 //
4913 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4915 {
4916 #if defined(__aarch64__)
4917     return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4918 #else
4919     return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4920 #endif
4921 }
4922 
4923 // Load a double-precision (64-bit) floating-point element from memory into the
4924 // upper element of dst, and copy the lower element from a to dst. mem_addr does
4925 // not need to be aligned on any particular boundary.
4926 //
4927 //   dst[63:0] := a[63:0]
4928 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4929 //
4930 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4932 {
4933 #if defined(__aarch64__)
4934     return vreinterpretq_m128d_f64(
4935         vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4936 #else
4937     return vreinterpretq_m128d_f32(vcombine_f32(
4938         vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4939 #endif
4940 }
4941 
4942 // Load a double-precision (64-bit) floating-point element from memory into both
4943 // elements of dst.
4944 //
4945 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4946 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4947 //
4948 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4949 #define _mm_load_pd1 _mm_load1_pd
4950 
4951 // Load a double-precision (64-bit) floating-point element from memory into both
4952 // elements of dst.
4953 //
4954 //   dst[63:0] := MEM[mem_addr+63:mem_addr]
4955 //   dst[127:64] := MEM[mem_addr+63:mem_addr]
4956 //
4957 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
4958 #define _mm_loaddup_pd _mm_load1_pd
4959 
4960 // Loads 128-bit value. :
4961 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4963 {
4964     return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4965 }
4966 
4967 // Load unaligned 32-bit integer from memory into the first element of dst.
4968 //
4969 //   dst[31:0] := MEM[mem_addr+31:mem_addr]
4970 //   dst[MAX:32] := 0
4971 //
4972 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4974 {
4975     return vreinterpretq_m128i_s32(
4976         vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4977 }
4978 
4979 // Convert packed double-precision (64-bit) floating-point elements in a to
4980 // packed single-precision (32-bit) floating-point elements, and store the
4981 // results in dst.
4982 //
4983 //   FOR j := 0 to 1
4984 //     i := 32*j
4985 //     k := 64*j
4986 //     dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
4987 //   ENDFOR
4988 //   dst[127:64] := 0
4989 //
4990 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
4992 {
4993 #if defined(__aarch64__)
4994     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
4995     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
4996 #else
4997     float a0 = (float) ((double *) &a)[0];
4998     float a1 = (float) ((double *) &a)[1];
4999     return _mm_set_ps(0, 0, a1, a0);
5000 #endif
5001 }
5002 
5003 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
5004 //
5005 //   dst[63:0] := a[63:0]
5006 //
5007 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
5009 {
5010 #if defined(__aarch64__)
5011     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
5012 #else
5013     return ((double *) &a)[0];
5014 #endif
5015 }
5016 
5017 // Convert packed single-precision (32-bit) floating-point elements in a to
5018 // packed double-precision (64-bit) floating-point elements, and store the
5019 // results in dst.
5020 //
5021 //   FOR j := 0 to 1
5022 //     i := 64*j
5023 //     k := 32*j
5024 //     dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
5025 //   ENDFOR
5026 //
5027 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
5029 {
5030 #if defined(__aarch64__)
5031     return vreinterpretq_m128d_f64(
5032         vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
5033 #else
5034     double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
5035     double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
5036     return _mm_set_pd(a1, a0);
5037 #endif
5038 }
5039 
5040 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
5041 // compilation and does not generate any instructions, thus it has zero latency.
5042 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
5044 {
5045     return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
5046 }
5047 
5048 // Blend packed single-precision (32-bit) floating-point elements from a and b
5049 // using mask, and store the results in dst.
5050 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
5052 {
5053     return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
5054                                             vreinterpretq_f32_m128(b),
5055                                             vreinterpretq_f32_m128(a)));
5056 }
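
// Illustrative example (not part of the original header), using a
// compare-generated all-ones/all-zeros mask (for such masks the NEON bitwise
// select used above matches the sign-bit selection of the x86 BLENDVPS):
//
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
//   __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);
//   __m128 m = _mm_cmpgt_ps(a, _mm_set1_ps(2.5f)); /* lanes 2,3 selected   */
//   __m128 r = _mm_blendv_ps(a, b, m);             /* lanes 0..3 = 1 2 7 8 */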
5057 
5058 // Round the packed single-precision (32-bit) floating-point elements in a using
5059 // the rounding parameter, and store the results as packed single-precision
5060 // floating-point elements in dst.
5061 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
5063 {
5064 #if defined(__aarch64__)
5065     switch (rounding) {
5066     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
5067         return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
5068     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
5069         return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
5070     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
5071         return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
5072     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
5073         return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
5074     default:  //_MM_FROUND_CUR_DIRECTION
5075         return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
5076     }
5077 #else
5078     float *v_float = (float *) &a;
5079     __m128 zero, neg_inf, pos_inf;
5080 
5081     switch (rounding) {
5082     case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
5083         return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
5084     case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
5085         return (__m128){floorf(v_float[0]), floorf(v_float[1]),
5086                         floorf(v_float[2]), floorf(v_float[3])};
5087     case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
5088         return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
5089                         ceilf(v_float[3])};
5090     case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
5091         zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
5092         neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
5093                              floorf(v_float[2]), floorf(v_float[3]));
5094         pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
5095                              ceilf(v_float[2]), ceilf(v_float[3]));
5096         return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
5097     default:  //_MM_FROUND_CUR_DIRECTION
5098         return (__m128){roundf(v_float[0]), roundf(v_float[1]),
5099                         roundf(v_float[2]), roundf(v_float[3])};
5100     }
5101 #endif
5102 }
5103 
5104 // Convert packed single-precision (32-bit) floating-point elements in a to
5105 // packed 32-bit integers, and store the results in dst.
5106 //
5107 //   FOR j := 0 to 1
5108 //       i := 32*j
5109 //       dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
5110 //   ENDFOR
5111 //
5112 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
5113 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
5114 {
5115 #if defined(__aarch64__)
5116     return vreinterpret_m64_s32(
5117         vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
5118 #else
5119     return vreinterpret_m64_s32(
5120         vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
5121             _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
5122 #endif
5123 }
5124 
5125 // Round the packed single-precision (32-bit) floating-point elements in a up to
5126 // an integer value, and store the results as packed single-precision
5127 // floating-point elements in dst.
5128 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
5129 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
5130 {
5131     return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
5132 }
5133 
5134 // Round the packed single-precision (32-bit) floating-point elements in a down
5135 // to an integer value, and store the results as packed single-precision
5136 // floating-point elements in dst.
5137 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
5138 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
5139 {
5140     return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
5141 }
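
// Usage sketch (not part of the original header; the helper name is
// hypothetical, and the unaligned load/store intrinsics are assumed to be
// defined earlier in this file): floors four packed floats from memory.
FORCE_INLINE void sse2neon_example_floor4(const float *in, float *out)
{
    __m128 v = _mm_loadu_ps(in);          // load 4 unaligned floats
    _mm_storeu_ps(out, _mm_floor_ps(v));  // out[i] = floorf(in[i])
}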
5142 
5143 
5144 // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
5145 // may perform better than _mm_loadu_si128 when the data crosses a cache line
5146 // boundary.
5147 //
5148 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
5149 //
5150 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
5151 #define _mm_lddqu_si128 _mm_loadu_si128
5152 
5153 /* Miscellaneous Operations */
5154 
5155 // Shifts the 8 signed 16-bit integers in a right by count bits while shifting
5156 // in the sign bit.
5157 //
5158 //   r0 := a0 >> count
5159 //   r1 := a1 >> count
5160 //   ...
5161 //   r7 := a7 >> count
5162 //
5163 // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
5164 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5165 {
5166     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5167     if (c > 15)
5168         return _mm_cmplt_epi16(a, _mm_setzero_si128());
5169     return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5170 }
5171 
5172 // Shifts the 4 signed 32-bit integers in a right by count bits while shifting
5173 // in the sign bit.
5174 //
5175 //   r0 := a0 >> count
5176 //   r1 := a1 >> count
5177 //   r2 := a2 >> count
5178 //   r3 := a3 >> count
5179 //
5180 // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
5181 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5182 {
5183     int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5184     if (c > 31)
5185         return _mm_cmplt_epi32(a, _mm_setzero_si128());
5186     return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5187 }
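
// Usage sketch (not part of the original header; the helper name is
// hypothetical, and _mm_set_epi64x is assumed to be defined earlier in this
// file): an arithmetic shift replicates the sign bit, so a lane holding -8
// becomes -4 after shifting right by one.
FORCE_INLINE __m128i sse2neon_example_sra1_epi32(__m128i a)
{
    // The shift count is taken from the low 64 bits of the second operand.
    return _mm_sra_epi32(a, _mm_set_epi64x(0, 1));
}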
5188 
5189 // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
5190 // saturates.
5191 // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
5192 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
5193 {
5194     return vreinterpretq_m128i_s8(
5195         vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
5196                     vqmovn_s16(vreinterpretq_s16_m128i(b))));
5197 }
5198 
5199 // Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
5200 // integers and saturates.
5201 //
5202 //   r0 := UnsignedSaturate(a0)
5203 //   r1 := UnsignedSaturate(a1)
5204 //   ...
5205 //   r7 := UnsignedSaturate(a7)
5206 //   r8 := UnsignedSaturate(b0)
5207 //   r9 := UnsignedSaturate(b1)
5208 //   ...
5209 //   r15 := UnsignedSaturate(b7)
5210 //
5211 // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
5212 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
5213 {
5214     return vreinterpretq_m128i_u8(
5215         vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
5216                     vqmovun_s16(vreinterpretq_s16_m128i(b))));
5217 }
5218 
5219 // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
5220 // and saturates.
5221 //
5222 //   r0 := SignedSaturate(a0)
5223 //   r1 := SignedSaturate(a1)
5224 //   r2 := SignedSaturate(a2)
5225 //   r3 := SignedSaturate(a3)
5226 //   r4 := SignedSaturate(b0)
5227 //   r5 := SignedSaturate(b1)
5228 //   r6 := SignedSaturate(b2)
5229 //   r7 := SignedSaturate(b3)
5230 //
5231 // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
5232 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
5233 {
5234     return vreinterpretq_m128i_s16(
5235         vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
5236                      vqmovn_s32(vreinterpretq_s32_m128i(b))));
5237 }
5238 
5239 // Packs the 8 signed 32-bit integers from a and b into unsigned 16-bit
5240 // integers and saturates.
5241 //
5242 //   r0 := UnsignedSaturate(a0)
5243 //   r1 := UnsignedSaturate(a1)
5244 //   r2 := UnsignedSaturate(a2)
5245 //   r3 := UnsignedSaturate(a3)
5246 //   r4 := UnsignedSaturate(b0)
5247 //   r5 := UnsignedSaturate(b1)
5248 //   r6 := UnsignedSaturate(b2)
5249 //   r7 := UnsignedSaturate(b3)
5250 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
5251 {
5252     return vreinterpretq_m128i_u16(
5253         vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
5254                      vqmovun_s32(vreinterpretq_s32_m128i(b))));
5255 }
5256 
5257 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
5258 // 8 signed or unsigned 8-bit integers in b.
5259 //
5260 //   r0 := a0
5261 //   r1 := b0
5262 //   r2 := a1
5263 //   r3 := b1
5264 //   ...
5265 //   r14 := a7
5266 //   r15 := b7
5267 //
5268 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
5269 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5270 {
5271 #if defined(__aarch64__)
5272     return vreinterpretq_m128i_s8(
5273         vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5274 #else
5275     int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5276     int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5277     int8x8x2_t result = vzip_s8(a1, b1);
5278     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5279 #endif
5280 }
5281 
5282 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
5283 // lower 4 signed or unsigned 16-bit integers in b.
5284 //
5285 //   r0 := a0
5286 //   r1 := b0
5287 //   r2 := a1
5288 //   r3 := b1
5289 //   r4 := a2
5290 //   r5 := b2
5291 //   r6 := a3
5292 //   r7 := b3
5293 //
5294 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
5295 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5296 {
5297 #if defined(__aarch64__)
5298     return vreinterpretq_m128i_s16(
5299         vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5300 #else
5301     int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5302     int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5303     int16x4x2_t result = vzip_s16(a1, b1);
5304     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5305 #endif
5306 }
5307 
5308 // Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
5309 // lower 2 signed or unsigned 32-bit integers in b.
5310 //
5311 //   r0 := a0
5312 //   r1 := b0
5313 //   r2 := a1
5314 //   r3 := b1
5315 //
5316 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
5317 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5318 {
5319 #if defined(__aarch64__)
5320     return vreinterpretq_m128i_s32(
5321         vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5322 #else
5323     int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5324     int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5325     int32x2x2_t result = vzip_s32(a1, b1);
5326     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5327 #endif
5328 }
5329 
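// Interleaves the lower signed or unsigned 64-bit integer in a with the
// lower signed or unsigned 64-bit integer in b.
//
//   r0 := a0
//   r1 := b0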
5330 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5331 {
5332     int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5333     int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5334     return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5335 }
5336 
5337 // Selects and interleaves the lower two single-precision, floating-point values
5338 // from a and b.
5339 //
5340 //   r0 := a0
5341 //   r1 := b0
5342 //   r2 := a1
5343 //   r3 := b1
5344 //
5345 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
5346 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
5347 {
5348 #if defined(__aarch64__)
5349     return vreinterpretq_m128_f32(
5350         vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5351 #else
5352     float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
5353     float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
5354     float32x2x2_t result = vzip_f32(a1, b1);
5355     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5356 #endif
5357 }
5358 
5359 // Selects and interleaves the upper two single-precision, floating-point values
5360 // from a and b.
5361 //
5362 //   r0 := a2
5363 //   r1 := b2
5364 //   r2 := a3
5365 //   r3 := b3
5366 //
5367 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
5368 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
5369 {
5370 #if defined(__aarch64__)
5371     return vreinterpretq_m128_f32(
5372         vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5373 #else
5374     float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
5375     float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
5376     float32x2x2_t result = vzip_f32(a1, b1);
5377     return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5378 #endif
5379 }
5380 
5381 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
5382 // 8 signed or unsigned 8-bit integers in b.
5383 //
5384 //   r0 := a8
5385 //   r1 := b8
5386 //   r2 := a9
5387 //   r3 := b9
5388 //   ...
5389 //   r14 := a15
5390 //   r15 := b15
5391 //
5392 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
5393 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5394 {
5395 #if defined(__aarch64__)
5396     return vreinterpretq_m128i_s8(
5397         vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5398 #else
5399     int8x8_t a1 =
5400         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5401     int8x8_t b1 =
5402         vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5403     int8x8x2_t result = vzip_s8(a1, b1);
5404     return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5405 #endif
5406 }
5407 
5408 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5409 // upper 4 signed or unsigned 16-bit integers in b.
5410 //
5411 //   r0 := a4
5412 //   r1 := b4
5413 //   r2 := a5
5414 //   r3 := b5
5415 //   r4 := a6
5416 //   r5 := b6
5417 //   r6 := a7
5418 //   r7 := b7
5419 //
5420 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
5421 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5422 {
5423 #if defined(__aarch64__)
5424     return vreinterpretq_m128i_s16(
5425         vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5426 #else
5427     int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5428     int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5429     int16x4x2_t result = vzip_s16(a1, b1);
5430     return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5431 #endif
5432 }
5433 
5434 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
5435 // upper 2 signed or unsigned 32-bit integers in b.
5436 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
5437 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5438 {
5439 #if defined(__aarch64__)
5440     return vreinterpretq_m128i_s32(
5441         vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5442 #else
5443     int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5444     int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5445     int32x2x2_t result = vzip_s32(a1, b1);
5446     return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5447 #endif
5448 }
5449 
5450 // Interleaves the upper signed or unsigned 64-bit integer in a with the
5451 // upper signed or unsigned 64-bit integer in b.
5452 //
5453 //   r0 := a1
5454 //   r1 := b1
5455 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5456 {
5457     int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5458     int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5459     return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5460 }
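
// Usage sketch (not part of the original header; the helper name is
// hypothetical): _mm_unpacklo_epi16 and _mm_unpackhi_epi16 together
// interleave two vectors into the full sequence a0,b0,a1,b1,...,a7,b7.
FORCE_INLINE void sse2neon_example_interleave_epi16(__m128i a,
                                                    __m128i b,
                                                    __m128i *lo,
                                                    __m128i *hi)
{
    *lo = _mm_unpacklo_epi16(a, b);  // a0,b0,a1,b1,a2,b2,a3,b3
    *hi = _mm_unpackhi_epi16(a, b);  // a4,b4,a5,b5,a6,b6,a7,b7
}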
5461 
5462 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
5463 // in a, store the minimum and index in dst, and zero the remaining bits in dst.
5464 //
5465 //   index[2:0] := 0
5466 //   min[15:0] := a[15:0]
5467 //   FOR j := 0 to 7
5468 //       i := j*16
5469 //       IF a[i+15:i] < min[15:0]
5470 //           index[2:0] := j
5471 //           min[15:0] := a[i+15:i]
5472 //       FI
5473 //   ENDFOR
5474 //   dst[15:0] := min[15:0]
5475 //   dst[18:16] := index[2:0]
5476 //   dst[127:19] := 0
5477 //
5478 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
5479 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
5480 {
5481     __m128i dst;
5482     uint16_t min, idx = 0;
5483     // Find the minimum value
5484 #if defined(__aarch64__)
5485     min = vminvq_u16(vreinterpretq_u16_m128i(a));
5486 #else
5487     __m64 tmp;
5488     tmp = vreinterpret_m64_u16(
5489         vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
5490                  vget_high_u16(vreinterpretq_u16_m128i(a))));
5491     tmp = vreinterpret_m64_u16(
5492         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5493     tmp = vreinterpret_m64_u16(
5494         vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5495     min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
5496 #endif
5497     // Get the index of the minimum value
5498     int i;
5499     for (i = 0; i < 8; i++) {
5500         if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
5501             idx = (uint16_t) i;
5502             break;
5503         }
5504         a = _mm_srli_si128(a, 2);
5505     }
5506     // Generate result
5507     dst = _mm_setzero_si128();
5508     dst = vreinterpretq_m128i_u16(
5509         vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
5510     dst = vreinterpretq_m128i_u16(
5511         vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
5512     return dst;
5513 }
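
// Usage sketch (not part of the original header; the helper name is
// hypothetical, and _mm_cvtsi128_si32 is assumed to be defined earlier in
// this file): the minimum lands in bits [15:0] of the result and its lane
// index in bits [18:16], so both can be read back through the low 32 bits.
FORCE_INLINE void sse2neon_example_minpos(__m128i a, uint16_t *min, int *idx)
{
    uint32_t lo = (uint32_t) _mm_cvtsi128_si32(_mm_minpos_epu16(a));
    *min = (uint16_t) (lo & 0xFFFF);
    *idx = (int) ((lo >> 16) & 0x7);
}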
5514 
5515 // shift to right
5516 // https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
5517 // http://blog.csdn.net/hemmingway/article/details/44828303
5518 // Clang requires a macro here, as it is extremely picky about c being a
5519 // literal.
5520 #define _mm_alignr_epi8(a, b, c) \
5521     ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
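
// Usage sketch (not part of the original header; the helper name is
// hypothetical): treats a as the upper and b as the lower half of a 256-bit
// value, shifts that value right by 4 bytes, and keeps the low 128 bits.
FORCE_INLINE __m128i sse2neon_example_alignr4(__m128i a, __m128i b)
{
    return _mm_alignr_epi8(a, b, 4);
}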
5522 
5523 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5524 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5525 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5526 // otherwise set CF to 0. Return the CF value.
5527 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
5528 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
5529 {
5530     int64x2_t s64 =
5531         vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
5532                   vreinterpretq_s64_m128i(b));
5533     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5534 }
5535 
5536 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5537 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5538 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5539 // otherwise set CF to 0. Return the ZF value.
5540 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
5541 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
5542 {
5543     int64x2_t s64 =
5544         vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
5545     return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5546 }
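
// Usage sketch (not part of the original header; the helper name is
// hypothetical): _mm_testz_si128(m, m) returns non-zero exactly when every
// bit of m is zero, a common "is this register all zeroes?" idiom.
FORCE_INLINE int sse2neon_example_is_all_zero(__m128i m)
{
    return _mm_testz_si128(m, m);
}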
5547 
5548 // Extracts the selected signed or unsigned 8-bit integer from a and zero
5549 // extends.
5550 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
5551 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
5552 
5553 // Inserts the least significant 8 bits of b into the selected 8-bit integer
5554 // of a.
5555 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
5556 //                                      __constrange(0,16) int imm)
5557 #define _mm_insert_epi8(a, b, imm)                                 \
5558     __extension__({                                                \
5559         vreinterpretq_m128i_s8(                                    \
5560             vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
5561     })
5562 
5563 // Extracts the selected signed or unsigned 16-bit integer from a and zero
5564 // extends.
5565 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
5566 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
5567 #define _mm_extract_epi16(a, imm) \
5568     vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
5569 
5570 // Inserts the least significant 16 bits of b into the selected 16-bit integer
5571 // of a.
5572 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
5573 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
5574 //                                       __constrange(0,8) int imm)
5575 #define _mm_insert_epi16(a, b, imm)                                  \
5576     __extension__({                                                  \
5577         vreinterpretq_m128i_s16(                                     \
5578             vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
5579     })
5580 
5581 // Extracts the selected signed or unsigned 32-bit integer from a and zero
5582 // extends.
5583 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
5584 #define _mm_extract_epi32(a, imm) \
5585     vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
5586 
5587 // Extracts the selected single-precision (32-bit) floating-point value from a.
5588 // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
5589 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
5590 
5591 // Inserts the least significant 32 bits of b into the selected 32-bit integer
5592 // of a.
5593 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
5594 //                                       __constrange(0,4) int imm)
5595 #define _mm_insert_epi32(a, b, imm)                                  \
5596     __extension__({                                                  \
5597         vreinterpretq_m128i_s32(                                     \
5598             vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
5599     })
5600 
5601 // Extracts the selected signed or unsigned 64-bit integer from a and zero
5602 // extends.
5603 // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
5604 #define _mm_extract_epi64(a, imm) \
5605     vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
5606 
5607 // Inserts the least significant 64 bits of b into the selected 64-bit integer
5608 // of a.
5609 // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
5610 //                                       __constrange(0,2) int imm)
5611 #define _mm_insert_epi64(a, b, imm)                                  \
5612     __extension__({                                                  \
5613         vreinterpretq_m128i_s64(                                     \
5614             vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
5615     })
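
// Usage sketch (not part of the original header; the helper name is
// hypothetical): swaps the two 64-bit halves of a vector with the extract
// and insert macros above.
FORCE_INLINE __m128i sse2neon_example_swap_epi64(__m128i a)
{
    int64_t lo = _mm_extract_epi64(a, 0);
    int64_t hi = _mm_extract_epi64(a, 1);
    a = _mm_insert_epi64(a, hi, 0);
    return _mm_insert_epi64(a, lo, 1);
}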
5616 
5617 // Count the number of bits set to 1 in unsigned 32-bit integer a, and
5618 // return that count in dst.
5619 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
5620 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
5621 {
5622 #if defined(__aarch64__)
5623 #if __has_builtin(__builtin_popcount)
5624     return __builtin_popcount(a);
5625 #else
5626     return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
5627 #endif
5628 #else
5629     uint32_t count = 0;
5630     uint8x8_t input_val, count8x8_val;
5631     uint16x4_t count16x4_val;
5632     uint32x2_t count32x2_val;
5633 
5634     input_val = vld1_u8((uint8_t *) &a);
5635     count8x8_val = vcnt_u8(input_val);
5636     count16x4_val = vpaddl_u8(count8x8_val);
5637     count32x2_val = vpaddl_u16(count16x4_val);
5638 
5639     vst1_u32(&count, count32x2_val);
5640     return count;
5641 #endif
5642 }
5643 
5644 // Count the number of bits set to 1 in unsigned 64-bit integer a, and
5645 // return that count in dst.
5646 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
5647 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
5648 {
5649 #if defined(__aarch64__)
5650 #if __has_builtin(__builtin_popcountll)
5651     return __builtin_popcountll(a);
5652 #else
5653     return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
5654 #endif
5655 #else
5656     uint64_t count = 0;
5657     uint8x8_t input_val, count8x8_val;
5658     uint16x4_t count16x4_val;
5659     uint32x2_t count32x2_val;
5660     uint64x1_t count64x1_val;
5661 
5662     input_val = vld1_u8((uint8_t *) &a);
5663     count8x8_val = vcnt_u8(input_val);
5664     count16x4_val = vpaddl_u8(count8x8_val);
5665     count32x2_val = vpaddl_u16(count16x4_val);
5666     count64x1_val = vpaddl_u32(count32x2_val);
5667     vst1_u64(&count, count64x1_val);
5668     return count;
5669 #endif
5670 }
5671 
5672 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
5673 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
5674 // transposed matrix in these vectors (row0 now contains column 0, etc.).
5675 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
5676 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)         \
5677     do {                                                  \
5678         float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
5679         float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
5680         row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
5681                             vget_low_f32(ROW23.val[0]));  \
5682         row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
5683                             vget_low_f32(ROW23.val[1]));  \
5684         row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
5685                             vget_high_f32(ROW23.val[0])); \
5686         row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
5687                             vget_high_f32(ROW23.val[1])); \
5688     } while (0)
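
// Usage sketch (not part of the original header; the helper name is
// hypothetical, and the unaligned load/store intrinsics are assumed to be
// defined earlier in this file): transposes a row-major 4x4 float matrix.
FORCE_INLINE void sse2neon_example_transpose4x4(float m[16])
{
    __m128 r0 = _mm_loadu_ps(m);
    __m128 r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    _mm_storeu_ps(m, r0);
    _mm_storeu_ps(m + 4, r1);
    _mm_storeu_ps(m + 8, r2);
    _mm_storeu_ps(m + 12, r3);
}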
5689 
5690 /* Crypto Extensions */
5691 
5692 #if defined(__ARM_FEATURE_CRYPTO)
5693 // Wraps vmull_p64
5694 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5695 {
5696     poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
5697     poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
5698     return vreinterpretq_u64_p128(vmull_p64(a, b));
5699 }
5700 #else  // ARMv7 polyfill
5701 // ARMv7 and some AArch64 targets lack vmull_p64, but they do have vmull_p8.
5702 //
5703 // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
5704 // 64-bit->128-bit polynomial multiply.
5705 //
5706 // It needs some work and is somewhat slow, but it is still faster than all
5707 // known scalar methods.
5708 //
5709 // Algorithm adapted to C from
5710 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
5711 // from "Fast Software Polynomial Multiplication on ARM Processors Using the
5712 // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
5713 // (https://hal.inria.fr/hal-01506572)
5714 static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5715 {
5716     poly8x8_t a = vreinterpret_p8_u64(_a);
5717     poly8x8_t b = vreinterpret_p8_u64(_b);
5718 
5719     // Masks
5720     uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
5721                                     vcreate_u8(0x00000000ffffffff));
5722     uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
5723                                     vcreate_u8(0x0000000000000000));
5724 
5725     // Do the multiplies, rotating with vext to get all combinations
5726     uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
5727     uint8x16_t e =
5728         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
5729     uint8x16_t f =
5730         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
5731     uint8x16_t g =
5732         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
5733     uint8x16_t h =
5734         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
5735     uint8x16_t i =
5736         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
5737     uint8x16_t j =
5738         vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
5739     uint8x16_t k =
5740         vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // K = A0 * B4
5741 
5742     // Add cross products
5743     uint8x16_t l = veorq_u8(e, f);  // L = E + F
5744     uint8x16_t m = veorq_u8(g, h);  // M = G + H
5745     uint8x16_t n = veorq_u8(i, j);  // N = I + J
5746 
5747     // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
5748     // instructions.
5749 #if defined(__aarch64__)
5750     uint8x16_t lm_p0 = vreinterpretq_u8_u64(
5751         vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5752     uint8x16_t lm_p1 = vreinterpretq_u8_u64(
5753         vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5754     uint8x16_t nk_p0 = vreinterpretq_u8_u64(
5755         vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5756     uint8x16_t nk_p1 = vreinterpretq_u8_u64(
5757         vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5758 #else
5759     uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
5760     uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
5761     uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
5762     uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
5763 #endif
5764     // t0 = (L) (P0 + P1) << 8
5765     // t1 = (M) (P2 + P3) << 16
5766     uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
5767     uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
5768     uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
5769 
5770     // t2 = (N) (P4 + P5) << 24
5771     // t3 = (K) (P6 + P7) << 32
5772     uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
5773     uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
5774     uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
5775 
5776     // De-interleave
5777 #if defined(__aarch64__)
5778     uint8x16_t t0 = vreinterpretq_u8_u64(
5779         vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5780     uint8x16_t t1 = vreinterpretq_u8_u64(
5781         vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5782     uint8x16_t t2 = vreinterpretq_u8_u64(
5783         vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5784     uint8x16_t t3 = vreinterpretq_u8_u64(
5785         vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5786 #else
5787     uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
5788     uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
5789     uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
5790     uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
5791 #endif
5792     // Shift the cross products
5793     uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
5794     uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
5795     uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
5796     uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32
5797 
5798     // Accumulate the products
5799     uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
5800     uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
5801     uint8x16_t mix = veorq_u8(d, cross1);
5802     uint8x16_t r = veorq_u8(mix, cross2);
5803     return vreinterpretq_u64_u8(r);
5804 }
5805 #endif  // ARMv7 polyfill
5806 
5807 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
5808 {
5809     uint64x2_t a = vreinterpretq_u64_m128i(_a);
5810     uint64x2_t b = vreinterpretq_u64_m128i(_b);
5811     switch (imm & 0x11) {
5812     case 0x00:
5813         return vreinterpretq_m128i_u64(
5814             _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
5815     case 0x01:
5816         return vreinterpretq_m128i_u64(
5817             _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
5818     case 0x10:
5819         return vreinterpretq_m128i_u64(
5820             _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
5821     case 0x11:
5822         return vreinterpretq_m128i_u64(
5823             _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
5824     default:
5825         abort();
5826     }
5827 }
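
// Usage sketch (not part of the original header; the helper name is
// hypothetical): the immediate picks which 64-bit half of each operand is
// carry-less multiplied; 0x00 uses the low half of both a and b.
FORCE_INLINE __m128i sse2neon_example_clmul_low(__m128i a, __m128i b)
{
    return _mm_clmulepi64_si128(a, b, 0x00);
}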
5828 
5829 #if !defined(__ARM_FEATURE_CRYPTO)
5830 /* clang-format off */
5831 #define SSE2NEON_AES_DATA(w)                                           \
5832     {                                                                  \
5833         w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
5834         w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
5835         w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
5836         w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
5837         w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
5838         w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
5839         w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
5840         w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
5841         w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
5842         w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
5843         w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
5844         w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
5845         w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
5846         w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
5847         w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
5848         w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
5849         w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
5850         w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
5851         w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
5852         w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
5853         w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
5854         w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
5855         w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
5856         w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
5857         w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
5858         w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
5859         w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
5860         w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
5861         w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
5862         w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
5863         w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
5864         w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
5865         w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
5866         w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
5867         w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
5868         w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
5869         w(0xb0), w(0x54), w(0xbb), w(0x16)                             \
5870     }
5871 /* clang-format on */
5872 
5873 /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
5874 #define SSE2NEON_AES_H0(x) (x)
5875 static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
5876 #undef SSE2NEON_AES_H0
5877 
5878 // In the absence of crypto extensions, implement aesenc using regular NEON
5879 // intrinsics instead. See:
5880 // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
5881 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
5882 // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
5883 // for more information. Reproduced with permission of the author.
5884 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
5885 {
5886 #if defined(__aarch64__)
5887     static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
5888                                          0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
5889                                          0xc, 0x1, 0x6, 0xb};
5890     static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
5891                                        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
5892 
5893     uint8x16_t v;
5894     uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
5895 
5896     // shift rows
5897     w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
5898 
5899     // sub bytes
5900     v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
5901     v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
5902     v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
5903     v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
5904 
5905     // mix columns
5906     w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
5907     w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
5908     w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
5909 
5910     //  add round key
5911     return vreinterpretq_m128i_u8(w) ^ RoundKey;
5912 
5913 #else /* ARMv7-A NEON implementation */
5914 #define SSE2NEON_AES_B2W(b0, b1, b2, b3)                                       \
5915     (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
5916      (b0))
5917 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
5918 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
5919 #define SSE2NEON_AES_U0(p) \
5920     SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
5921 #define SSE2NEON_AES_U1(p) \
5922     SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
5923 #define SSE2NEON_AES_U2(p) \
5924     SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
5925 #define SSE2NEON_AES_U3(p) \
5926     SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
5927     static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
5928         SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
5929         SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
5930         SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
5931         SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
5932     };
5933 #undef SSE2NEON_AES_B2W
5934 #undef SSE2NEON_AES_F2
5935 #undef SSE2NEON_AES_F3
5936 #undef SSE2NEON_AES_U0
5937 #undef SSE2NEON_AES_U1
5938 #undef SSE2NEON_AES_U2
5939 #undef SSE2NEON_AES_U3
5940 
5941     uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
5942     uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
5943     uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
5944     uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
5945 
5946     __m128i out = _mm_set_epi32(
5947         (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
5948          aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
5949         (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
5950          aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
5951         (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
5952          aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
5953         (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
5954          aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
5955 
5956     return _mm_xor_si128(out, RoundKey);
5957 #endif
5958 }
5959 
5960 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5961 {
5962     /* FIXME: not yet optimized for NEON */
5963     uint8_t v[4][4] = {
5964         [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
5965                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
5966                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
5967                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
5968         [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
5969                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
5970                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
5971                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
5972         [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
5973                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
5974                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
5975                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
5976         [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
5977                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
5978                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
5979                SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
5980     };
5981     for (int i = 0; i < 16; i++)
5982         vreinterpretq_nth_u8_m128i(a, i) =
5983             v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
5984     return a;
5985 }
5986 
5987 // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
5988 // This instruction generates a round key for AES encryption. See
5989 // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
5990 // for details.
5991 //
5992 // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
5993 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
5994 {
5995     uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
5996     uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
5997     for (int i = 0; i < 4; ++i) {
5998         ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
5999         ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
6000     }
6001     return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
6002                          ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
6003 }
6004 #undef SSE2NEON_AES_DATA
6005 
6006 #else /* __ARM_FEATURE_CRYPTO */
6007 // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
6008 // AESMC and then manually applying the real key as an xor operation. This
6009 // unfortunately means an additional xor op; the compiler should be able to
6010 // optimize this away for repeated calls however. See
6011 // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
6012 // for more details.
6013 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
6014 {
6015     return vreinterpretq_m128i_u8(
6016         vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
6017         vreinterpretq_u8_m128i(b));
6018 }
6019 
6020 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
6021 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
6022 {
6023     return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
6024                              vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
6025                          RoundKey);
6026 }
6027 
6028 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
6029 {
6030     // AESE does ShiftRows and SubBytes on A
6031     uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
6032 
6033     uint8x16_t dest = {
6034         // Undo ShiftRows step from AESE and extract X1 and X3
6035         u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
6036         u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
6037         u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
6038         u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
6039     };
6040     uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
6041     return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
6042 }
6043 #endif
6044 
6045 /* Streaming Extensions */
6046 
6047 // Guarantees that every preceding store is globally visible before any
6048 // subsequent store.
6049 // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
6050 FORCE_INLINE void _mm_sfence(void)
6051 {
6052     __sync_synchronize();
6053 }
6054 
6055 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
6056 // point elements) from a into memory using a non-temporal memory hint.
6057 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
6058 FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
6059 {
6060 #if __has_builtin(__builtin_nontemporal_store)
6061     __builtin_nontemporal_store(a, (float32x4_t *) p);
6062 #else
6063     vst1q_f32(p, vreinterpretq_f32_m128(a));
6064 #endif
6065 }
6066 
6067 // Stores the data in a to the address p without polluting the caches.  If the
6068 // cache line containing address p is already in the cache, the cache will be
6069 // updated.
6070 // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
6071 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
6072 {
6073 #if __has_builtin(__builtin_nontemporal_store)
6074     __builtin_nontemporal_store(a, p);
6075 #else
6076     vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
6077 #endif
6078 }
6079 
6080 // Load 128-bits of integer data from memory into dst using a non-temporal
6081 // memory hint. mem_addr must be aligned on a 16-byte boundary or a
6082 // general-protection exception may be generated.
6083 //
6084 //   dst[127:0] := MEM[mem_addr+127:mem_addr]
6085 //
6086 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
6087 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
6088 {
6089 #if __has_builtin(__builtin_nontemporal_load)
6090     return __builtin_nontemporal_load(p);
6091 #else
6092     return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
6093 #endif
6094 }
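
// Usage sketch (not part of the original header; the helper name is
// hypothetical, and _mm_loadu_si128 is assumed to be defined earlier in this
// file): copies n 16-byte blocks with non-temporal stores, hinting that the
// destination data should not displace existing cache contents.
FORCE_INLINE void sse2neon_example_stream_copy(__m128i *dst,
                                               const __m128i *src,
                                               size_t n)
{
    for (size_t i = 0; i < n; i++)
        _mm_stream_si128(dst + i, _mm_loadu_si128(src + i));
}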
6095 
6096 // Cache line containing p is flushed and invalidated from all caches in the
6097 // coherency domain.
6098 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
6099 FORCE_INLINE void _mm_clflush(void const *p)
6100 {
6101     (void) p;
6102     // no NEON equivalent; implemented as a no-op
6103 }
6104 
6105 // Allocate aligned blocks of memory.
6106 // https://software.intel.com/en-us/
6107 //         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
6108 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
6109 {
6110     void *ptr;
6111     if (align == 1)
6112         return malloc(size);
6113     if (align == 2 || (sizeof(void *) == 8 && align == 4))
6114         align = sizeof(void *);
6115     if (!posix_memalign(&ptr, align, size))
6116         return ptr;
6117     return NULL;
6118 }
6119 
6120 FORCE_INLINE void _mm_free(void *addr)
6121 {
6122     free(addr);
6123 }
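
// Usage sketch (not part of the original header; the helper name is
// hypothetical): allocates a 16-byte-aligned float buffer suitable for the
// aligned load/store intrinsics; release it with _mm_free(), not free().
FORCE_INLINE float *sse2neon_example_alloc_floats(size_t n)
{
    return (float *) _mm_malloc(n * sizeof(float), 16);
}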
6124 
6125 // Starting with the initial value in crc, accumulates a CRC32 value for
6126 // unsigned 8-bit integer v.
6127 // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
6128 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
6129 {
6130 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6131     __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
6132                          : [c] "+r"(crc)
6133                          : [v] "r"(v));
6134 #else
6135     crc ^= v;
6136     for (int bit = 0; bit < 8; bit++) {
6137         if (crc & 1)
6138             crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
6139         else
6140             crc = (crc >> 1);
6141     }
6142 #endif
6143     return crc;
6144 }
6145 
6146 // Starting with the initial value in crc, accumulates a CRC32 value for
6147 // unsigned 16-bit integer v.
6148 // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
6149 FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
6150 {
6151 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6152     __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
6153                          : [c] "+r"(crc)
6154                          : [v] "r"(v));
6155 #else
6156     crc = _mm_crc32_u8(crc, v & 0xff);
6157     crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
6158 #endif
6159     return crc;
6160 }
6161 
6162 // Starting with the initial value in crc, accumulates a CRC32 value for
6163 // unsigned 32-bit integer v.
6164 // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
6165 FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
6166 {
6167 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6168     __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
6169                          : [c] "+r"(crc)
6170                          : [v] "r"(v));
6171 #else
6172     crc = _mm_crc32_u16(crc, v & 0xffff);
6173     crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
6174 #endif
6175     return crc;
6176 }
6177 
6178 // Starting with the initial value in crc, accumulates a CRC32 value for
6179 // unsigned 64-bit integer v.
6180 // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
6181 FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
6182 {
6183 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6184     __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
6185                          : [c] "+r"(crc)
6186                          : [v] "r"(v));
6187 #else
6188     crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
6189     crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
6190 #endif
6191     return crc;
6192 }
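
// Usage sketch (not part of the original header; the helper name is
// hypothetical): computes a CRC-32C (Castagnoli) checksum over a byte
// buffer using the conventional all-ones initial value and final inversion.
FORCE_INLINE uint32_t sse2neon_example_crc32c(const uint8_t *buf, size_t len)
{
    uint32_t crc = 0xFFFFFFFF;
    for (size_t i = 0; i < len; i++)
        crc = _mm_crc32_u8(crc, buf[i]);
    return crc ^ 0xFFFFFFFF;
}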
6193 
6194 #if defined(__GNUC__) || defined(__clang__)
6195 #pragma pop_macro("ALIGN_STRUCT")
6196 #pragma pop_macro("FORCE_INLINE")
6197 #endif
6198 
6199 #if defined(__GNUC__)
6200 #pragma GCC pop_options
6201 #endif
6202 
6203 #endif
6204