1 #ifndef SSE2NEON_H
2 #define SSE2NEON_H
3
4 // This header file provides a simple API translation layer
// from SSE intrinsics to their corresponding Arm/AArch64 NEON versions
6 //
7 // This header file does not yet translate all of the SSE intrinsics.
8 //
9 // Contributors to this work are:
10 // John W. Ratcliff <jratcliffscarab@gmail.com>
11 // Brandon Rowlett <browlett@nvidia.com>
12 // Ken Fast <kfast@gdeb.com>
13 // Eric van Beurden <evanbeurden@nvidia.com>
14 // Alexander Potylitsin <apotylitsin@nvidia.com>
15 // Hasindu Gamaarachchi <hasindu2008@gmail.com>
16 // Jim Huang <jserv@biilabs.io>
17 // Mark Cheng <marktwtn@biilabs.io>
18 // Malcolm James MacLeod <malcolm@gulden.com>
19 // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20 // Sebastian Pop <spop@amazon.com>
21 // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22 // Danila Kutenin <danilak@google.com>
23 // François Turban (JishinMaster) <francois.turban@gmail.com>
24 // Pei-Hsuan Hung <afcidk@gmail.com>
25 // Yang-Hao Yuan <yanghau@biilabs.io>
26 // Syoyo Fujita <syoyo@lighttransport.com>
27 // Brecht Van Lommel <brecht@blender.org>
28
29 /*
30 * sse2neon is freely redistributable under the MIT License.
31 *
32 * Permission is hereby granted, free of charge, to any person obtaining a copy
33 * of this software and associated documentation files (the "Software"), to deal
34 * in the Software without restriction, including without limitation the rights
35 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
36 * copies of the Software, and to permit persons to whom the Software is
37 * furnished to do so, subject to the following conditions:
38 *
39 * The above copyright notice and this permission notice shall be included in
40 * all copies or substantial portions of the Software.
41 *
42 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
47 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
48 * SOFTWARE.
49 */
50
51 /* Tunable configurations */
52
53 /* Enable precise implementation of math operations
 * This slows the computation down a bit, but gives results consistent with
 * x86 SSE2 (e.g. it can fix a hole or NaN pixel in a rendering result).
56 */
57 /* _mm_min_ps and _mm_max_ps */
58 #ifndef SSE2NEON_PRECISE_MINMAX
59 #define SSE2NEON_PRECISE_MINMAX (0)
60 #endif
61 /* _mm_rcp_ps and _mm_div_ps */
62 #ifndef SSE2NEON_PRECISE_DIV
63 #define SSE2NEON_PRECISE_DIV (0)
64 #endif
65 /* _mm_sqrt_ps and _mm_rsqrt_ps */
66 #ifndef SSE2NEON_PRECISE_SQRT
67 #define SSE2NEON_PRECISE_SQRT (0)
68 #endif
69 #ifndef SSE2NEON_PRECISE_RSQRT
70 #define SSE2NEON_PRECISE_RSQRT (0)
71 #endif
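/* For example, one of these options can be enabled by defining the
 * corresponding macro to 1 before this header is included (a usage sketch,
 * not part of the API itself):
 *
 *   #define SSE2NEON_PRECISE_MINMAX 1
 *   #include "sse2neon.h"
 */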
72
73 #if defined(__GNUC__) || defined(__clang__)
74 #pragma push_macro("FORCE_INLINE")
75 #pragma push_macro("ALIGN_STRUCT")
76 #define FORCE_INLINE static inline __attribute__((always_inline))
77 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
78 #ifndef likely
79 #define likely(x) __builtin_expect(!!(x), 1)
80 #endif
81 #ifndef unlikely
82 #define unlikely(x) __builtin_expect(!!(x), 0)
83 #endif
84 #else
85 #error "Macro name collisions may happen with unsupported compiler."
86 #ifdef FORCE_INLINE
87 #undef FORCE_INLINE
88 #endif
89 #define FORCE_INLINE static inline
90 #ifndef ALIGN_STRUCT
91 #define ALIGN_STRUCT(x) __declspec(align(x))
92 #endif
93 #endif
94 #ifndef likely
95 #define likely(x) (x)
96 #endif
97 #ifndef unlikely
98 #define unlikely(x) (x)
99 #endif
100
101 #include <stdint.h>
102 #include <stdlib.h>
103
104 /* Architecture-specific build options */
105 /* FIXME: #pragma GCC push_options is only available on GCC */
106 #if defined(__GNUC__)
107 #if defined(__arm__) && __ARM_ARCH == 7
/* According to the ARM C Language Extensions (ACLE) specification,
109 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
110 * architecture supported.
111 */
112 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
113 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
114 #endif
115 #if !defined(__clang__)
116 #pragma GCC push_options
117 #pragma GCC target("fpu=neon")
118 #endif
119 #elif defined(__aarch64__)
120 #if !defined(__clang__)
121 #pragma GCC push_options
122 #pragma GCC target("+simd")
123 #endif
124 #else
125 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
126 #endif
127 #endif
128
129 #include <arm_neon.h>
130
/* Rounding functions require either AArch64 instructions or a libm fallback */
132 #if !defined(__aarch64__)
133 #include <math.h>
134 #endif
135
136 /* "__has_builtin" can be used to query support for built-in functions
137 * provided by gcc/clang and other compilers that support it.
138 */
139 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
140 /* Compatibility with gcc <= 9 */
141 #if __GNUC__ <= 9
142 #define __has_builtin(x) HAS##x
143 #define HAS__builtin_popcount 1
144 #define HAS__builtin_popcountll 1
145 #else
146 #define __has_builtin(x) 0
147 #endif
148 #endif
149
150 /**
151 * MACRO for shuffle parameter for _mm_shuffle_ps().
152 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
153 * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
154 * for fp2 in result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
156 * fp0 is the same for fp0 of result.
157 */
158 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
159 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
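/* For example, given a = (a3, a2, a1, a0) and b = (b3, b2, b1, b0),
 *
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 *
 * yields r = (b3, b2, a1, a0): the two high lanes are selected from b and the
 * two low lanes from a, each chosen by the corresponding digit of the macro.
 */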
160
161 /* Rounding mode macros. */
162 #define _MM_FROUND_TO_NEAREST_INT 0x00
163 #define _MM_FROUND_TO_NEG_INF 0x01
164 #define _MM_FROUND_TO_POS_INF 0x02
165 #define _MM_FROUND_TO_ZERO 0x03
166 #define _MM_FROUND_CUR_DIRECTION 0x04
167 #define _MM_FROUND_NO_EXC 0x08
168 #define _MM_ROUND_NEAREST 0x0000
169 #define _MM_ROUND_DOWN 0x2000
170 #define _MM_ROUND_UP 0x4000
171 #define _MM_ROUND_TOWARD_ZERO 0x6000
172
173 /* indicate immediate constant argument in a given range */
174 #define __constrange(a, b) const
175
176 /* A few intrinsics accept traditional data types like ints or floats, but
177 * most operate on data types that are specific to SSE.
178 * If a vector type ends in d, it contains doubles, and if it does not have
179 * a suffix, it contains floats. An integer vector type can contain any type
180 * of integer, from chars to shorts to unsigned long longs.
181 */
182 typedef int64x1_t __m64;
183 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On 32-bit Arm, the float64x2_t type is not supported, so the __m128d data
// type must be represented differently for the related intrinsic conversions.
187 #if defined(__aarch64__)
188 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
189 #else
190 typedef float32x4_t __m128d;
191 #endif
192 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
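/* A small illustrative sketch of how these types are typically used (the
 * intrinsics referenced here are defined later in this header):
 *
 *   __m128  vf = _mm_set1_ps(1.0f);    // four single-precision floats
 *   __m128i vi = _mm_set1_epi32(42);   // four signed 32-bit integers
 *   __m128d vd = _mm_set1_pd(3.0);     // two double-precision floats
 */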
193
194 /* type-safe casting between types */
195
196 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
197 #define vreinterpretq_m128_f32(x) (x)
198 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
199
200 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
201 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
202 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
203 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
204
205 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
206 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
207 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
208 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
209
210 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
211 #define vreinterpretq_f32_m128(x) (x)
212 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
213
214 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
215 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
216 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
217 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
218
219 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
220 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
221 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
222 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
223
224 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
225 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
226 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
227 #define vreinterpretq_m128i_s64(x) (x)
228
229 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
230 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
231 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
232 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
233
234 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
235 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
236
237 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
238 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
239 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
240 #define vreinterpretq_s64_m128i(x) (x)
241
242 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
243 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
244 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
245 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
246
247 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
248 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
249 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
250 #define vreinterpret_m64_s64(x) (x)
251
252 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
253 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
254 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
255 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
256
257 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
258 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
259 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
260
261 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
262 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
263 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
264 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
265
266 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
267 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
268 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
269 #define vreinterpret_s64_m64(x) (x)
270
271 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
272
273 #if defined(__aarch64__)
274 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
275 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
276
277 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
278
279 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
280 #define vreinterpretq_m128d_f64(x) (x)
281
282 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
283
284 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
285
286 #define vreinterpretq_f64_m128d(x) (x)
287 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
288 #else
289 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
290 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
291
292 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
293 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
294
295 #define vreinterpretq_m128d_f32(x) (x)
296
297 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
298
299 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
300 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
301
302 #define vreinterpretq_f32_m128d(x) (x)
303 #endif
304
// A struct named 'SIMDVec' is defined in this header file and can be used by
// applications that attempt to access the contents of an __m128 struct
// directly. Note that accessing the __m128 struct directly is considered bad
// coding practice by Microsoft: @see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
310 //
311 // However, some legacy source code may try to access the contents of an __m128
312 // struct directly so the developer can use the SIMDVec as an alias for it. Any
313 // casting must be done manually by the developer, as you cannot cast or
314 // otherwise alias the base NEON data type for intrinsic operations.
315 //
// This union is intended to allow direct access to an __m128 variable using
// the names that the MSVC compiler provides. It should really only be used
// when trying to access the members of the vector as integer values. GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
321 //
// Ideally, direct accesses to SIMD vectors should be avoided, since they can
// cause a performance hit. If such access really is needed, the original __m128
324 // variable can be aliased with a pointer to this union and used to access
325 // individual components. The use of this union should be hidden behind a macro
326 // that is used throughout the codebase to access the members instead of always
327 // declaring this type of variable.
328 typedef union ALIGN_STRUCT(16) SIMDVec {
329 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
330 int8_t m128_i8[16]; // as signed 8-bit integers.
331 int16_t m128_i16[8]; // as signed 16-bit integers.
332 int32_t m128_i32[4]; // as signed 32-bit integers.
333 int64_t m128_i64[2]; // as signed 64-bit integers.
334 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
335 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
336 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
337 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
338 } SIMDVec;
339
340 // casting using SIMDVec
341 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
342 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
343 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
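/* A minimal sketch of the aliasing pattern described above (variable names
 * are illustrative only):
 *
 *   __m128i v = _mm_set1_epi32(7);
 *   SIMDVec *sv = (SIMDVec *) &v;      // manual cast done by the developer
 *   uint32_t lane0 = sv->m128_u32[0];  // == 7
 *   uint32_t lane1 = vreinterpretq_nth_u32_m128i(v, 1);  // same, via macro
 */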
344
/* Backwards compatibility for compilers that lack specific type support */
346
// Older GCC versions do not provide the vld1q_u8_x4 intrinsic
348 #if defined(__GNUC__) && !defined(__clang__) && \
349 ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \
350 (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \
351 (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7)
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
353 {
354 uint8x16x4_t ret;
355 ret.val[0] = vld1q_u8(p + 0);
356 ret.val[1] = vld1q_u8(p + 16);
357 ret.val[2] = vld1q_u8(p + 32);
358 ret.val[3] = vld1q_u8(p + 48);
359 return ret;
360 }
361 #else
362 // Wraps vld1q_u8_x4
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
364 {
365 return vld1q_u8_x4(p);
366 }
367 #endif
368
369 /* Function Naming Conventions
370 * The naming convention of SSE intrinsics is straightforward. A generic SSE
371 * intrinsic function is given as follows:
372 * _mm_<name>_<data_type>
373 *
374 * The parts of this format are given as follows:
375 * 1. <name> describes the operation performed by the intrinsic
376 * 2. <data_type> identifies the data type of the function's primary arguments
377 *
378 * This last part, <data_type>, is a little complicated. It identifies the
379 * content of the input values, and can be set to any of the following values:
380 * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
382 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
383 * signed integers
384 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
385 * unsigned integers
386 * + si128 - unspecified 128-bit vector or 256-bit vector
387 * + m128/m128i/m128d - identifies input vector types when they are different
388 * than the type of the returned vector
389 *
390 * For example, _mm_setzero_ps. The _mm implies that the function returns
391 * a 128-bit vector. The _ps at the end implies that the argument vectors
392 * contain floats.
393 *
394 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 * // Set packed 16-bit integers: 128 bits, eight 16-bit shorts
396 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
397 * // Set packed 8-bit integers
 * // 128 bits, sixteen 8-bit chars
399 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
400 * 4, 5, 12, 13, 6, 7, 14, 15);
401 * // Shuffle packed 8-bit integers
402 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
403 *
 * Data (Number, Binary, Byte Index):
   +------+------+------+------+------+------+------+------+
   |      1      |      2      |      3      |      4      | Number
   +------+------+------+------+------+------+------+------+
   | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
   +------+------+------+------+------+------+------+------+
   |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
   +------+------+------+------+------+------+------+------+

   +------+------+------+------+------+------+------+------+
   |      5      |      6      |      7      |      8      | Number
   +------+------+------+------+------+------+------+------+
   | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
   +------+------+------+------+------+------+------+------+
   |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
   +------+------+------+------+------+------+------+------+
 * Index (Byte Index):
   +------+------+------+------+------+------+------+------+
   |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
   +------+------+------+------+------+------+------+------+

   +------+------+------+------+------+------+------+------+
   |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
   +------+------+------+------+------+------+------+------+
 * Result:
   +------+------+------+------+------+------+------+------+
   |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
   +------+------+------+------+------+------+------+------+
   | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
   +------+------+------+------+------+------+------+------+
   |     256     |      2      |      5      |      6      | Number
   +------+------+------+------+------+------+------+------+

   +------+------+------+------+------+------+------+------+
   |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
   +------+------+------+------+------+------+------+------+
   | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
   +------+------+------+------+------+------+------+------+
   |      3      |      7      |      4      |      8      | Number
   +------+------+------+------+------+------+------+------+
444 */
445
446 /* Set/get methods */
447
448 /* Constants for use with _mm_prefetch. */
449 enum _mm_hint {
450 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
451 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
452 _MM_HINT_T1 = 2, /* load data to L2 cache only */
453 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
454 _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
455 _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
456 _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
457 _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
458 };
459
460 // Loads one cache line of data from address p to a location closer to the
461 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
FORCE_INLINE void _mm_prefetch(const void *p, int i)
463 {
464 (void) i;
465 __builtin_prefetch(p);
466 }
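/* A usage sketch (the array, prefetch distance, and process() are illustrative
 * only); note that this implementation ignores the hint and issues the same
 * __builtin_prefetch regardless of the hint value:
 *
 *   for (size_t k = 0; k + 16 < n; k++) {
 *       _mm_prefetch((const char *) &data[k + 16], _MM_HINT_T0);
 *       process(data[k]);
 *   }
 */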
467
// Pause the processor. This is typically used in spin-wait loops and, depending
// on the x86 processor, typical delays are in the 40-100 cycle range. The
// 'yield' instruction isn't a good fit because it's effectively a nop on most
// Arm cores. Experience with several databases has shown that an 'isb' is
// a reasonable approximation.
FORCE_INLINE void _mm_pause()
474 {
475 __asm__ __volatile__("isb\n");
476 }
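/* A typical spin-wait sketch (the lock flag and atomic builtin are
 * illustrative only):
 *
 *   while (__atomic_load_n(&locked, __ATOMIC_ACQUIRE))
 *       _mm_pause();
 */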
477
478 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
479 //
480 // dst[31:0] := a[31:0]
481 //
482 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
484 {
485 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
486 }
487
488 // Convert the lower single-precision (32-bit) floating-point element in b to a
489 // double-precision (64-bit) floating-point element, store the result in the
490 // lower element of dst, and copy the upper element from a to the upper element
491 // of dst.
492 //
493 // dst[63:0] := Convert_FP32_To_FP64(b[31:0])
494 // dst[127:64] := a[127:64]
495 //
496 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
498 {
499 double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
500 #if defined(__aarch64__)
501 return vreinterpretq_m128d_f64(
502 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
503 #else
504 return vreinterpretq_m128d_s64(
505 vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
506 #endif
507 }
508
509 // Convert the lower single-precision (32-bit) floating-point element in a to a
510 // 32-bit integer, and store the result in dst.
511 //
512 // dst[31:0] := Convert_FP32_To_Int32(a[31:0])
513 //
514 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
515 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
516
517 // Convert the lower single-precision (32-bit) floating-point element in a to a
518 // 64-bit integer, and store the result in dst.
519 //
520 // dst[63:0] := Convert_FP32_To_Int64(a[31:0])
521 //
522 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
524 {
525 #if defined(__aarch64__)
526 return vgetq_lane_s64(
527 vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0);
528 #else
529 float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
530 float32_t diff = data - floor(data);
531 if (diff > 0.5)
532 return (int64_t) ceil(data);
533 if (unlikely(diff == 0.5)) {
534 int64_t f = (int64_t) floor(data);
535 int64_t c = (int64_t) ceil(data);
536 return c & 1 ? f : c;
537 }
538 return (int64_t) floor(data);
539 #endif
540 }
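/* The fallback path above implements round-half-to-even, matching the default
 * SSE rounding mode: for example 2.5f converts to 2 and 3.5f converts to 4,
 * while any other fraction converts to the nearest integer.
 */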
541
542 // Convert packed single-precision (32-bit) floating-point elements in a to
543 // packed 32-bit integers with truncation, and store the results in dst.
544 //
545 // FOR j := 0 to 1
546 // i := 32*j
547 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
548 // ENDFOR
549 //
550 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
552 {
553 return vreinterpret_m64_s32(
554 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
555 }
556
557 // Convert the lower single-precision (32-bit) floating-point element in a to a
558 // 32-bit integer with truncation, and store the result in dst.
559 //
560 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
561 //
562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
564 {
565 return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
566 }
567
568 // Convert packed single-precision (32-bit) floating-point elements in a to
569 // packed 32-bit integers with truncation, and store the results in dst.
570 //
571 // FOR j := 0 to 1
572 // i := 32*j
573 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
574 // ENDFOR
575 //
576 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
577 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
578
579 // Convert the lower single-precision (32-bit) floating-point element in a to a
580 // 32-bit integer with truncation, and store the result in dst.
581 //
582 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
583 //
584 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
585 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
586
587 // Convert the lower single-precision (32-bit) floating-point element in a to a
588 // 64-bit integer with truncation, and store the result in dst.
589 //
590 // dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
591 //
592 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
594 {
595 return vgetq_lane_s64(
596 vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0);
597 }
598
599 // Sets the 128-bit value to zero
600 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
FORCE_INLINE __m128i _mm_setzero_si128(void)
602 {
603 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
604 }
605
606 // Clears the four single-precision, floating-point values.
607 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setzero_ps(void)
609 {
610 return vreinterpretq_m128_f32(vdupq_n_f32(0));
611 }
612
613 // Return vector of type __m128d with all elements set to zero.
614 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
FORCE_INLINE __m128d _mm_setzero_pd(void)
616 {
617 #if defined(__aarch64__)
618 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
619 #else
620 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
621 #endif
622 }
623
624 // Sets the four single-precision, floating-point values to w.
625 //
626 // r0 := r1 := r2 := r3 := w
627 //
628 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set1_ps(float _w)
630 {
631 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
632 }
633
634 // Sets the four single-precision, floating-point values to w.
635 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
637 {
638 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
639 }
640
641 // Sets the four single-precision, floating-point values to the four inputs.
642 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
644 {
645 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
646 return vreinterpretq_m128_f32(vld1q_f32(data));
647 }
648
649 // Copy single-precision (32-bit) floating-point element a to the lower element
650 // of dst, and zero the upper 3 elements.
651 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
FORCE_INLINE __m128 _mm_set_ss(float a)
653 {
654 float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
655 return vreinterpretq_m128_f32(vld1q_f32(data));
656 }
657
658 // Sets the four single-precision, floating-point values to the four inputs in
659 // reverse order.
660 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
662 {
663 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
664 return vreinterpretq_m128_f32(vld1q_f32(data));
665 }
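/* To illustrate the ordering difference (values are illustrative only):
 *
 *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   // lane 0 holds 1.0f
 *   __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  // lane 0 holds 1.0f
 *
 * Both produce the same vector: _mm_set_ps takes its arguments from the
 * highest lane down, while _mm_setr_ps takes them in lane (memory) order.
 */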
666
667 // Sets the 8 signed 16-bit integer values in reverse order.
668 //
669 // Return Value
670 // r0 := w0
671 // r1 := w1
672 // ...
673 // r7 := w7
FORCE_INLINE __m128i _mm_setr_epi16(short w0,
675 short w1,
676 short w2,
677 short w3,
678 short w4,
679 short w5,
680 short w6,
681 short w7)
682 {
683 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
684 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
685 }
686
687 // Sets the 4 signed 32-bit integer values in reverse order
688 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
690 {
691 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
692 return vreinterpretq_m128i_s32(vld1q_s32(data));
693 }
694
695 // Set packed 64-bit integers in dst with the supplied values in reverse order.
696 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
698 {
699 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
700 }
701
702 // Sets the 16 signed 8-bit integer values to b.
703 //
704 // r0 := b
705 // r1 := b
706 // ...
707 // r15 := b
708 //
709 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
711 {
712 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
713 }
714
715 // Broadcast double-precision (64-bit) floating-point value a to all elements of
716 // dst.
717 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
FORCE_INLINE __m128d _mm_set1_pd(double d)
719 {
720 #if defined(__aarch64__)
721 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
722 #else
723 return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
724 #endif
725 }
726
727 // Sets the 8 signed 16-bit integer values to w.
728 //
729 // r0 := w
730 // r1 := w
731 // ...
732 // r7 := w
733 //
734 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set1_epi16(short w)
736 {
737 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
738 }
739
740 // Sets the 16 signed 8-bit integer values.
741 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
743 signed char b14,
744 signed char b13,
745 signed char b12,
746 signed char b11,
747 signed char b10,
748 signed char b9,
749 signed char b8,
750 signed char b7,
751 signed char b6,
752 signed char b5,
753 signed char b4,
754 signed char b3,
755 signed char b2,
756 signed char b1,
757 signed char b0)
758 {
759 int8_t ALIGN_STRUCT(16)
760 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
761 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
762 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
763 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
764 return (__m128i) vld1q_s8(data);
765 }
766
767 // Sets the 8 signed 16-bit integer values.
768 // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi16(short i7,
770 short i6,
771 short i5,
772 short i4,
773 short i3,
774 short i2,
775 short i1,
776 short i0)
777 {
778 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
779 return vreinterpretq_m128i_s16(vld1q_s16(data));
780 }
781
782 // Sets the 16 signed 8-bit integer values in reverse order.
783 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
785 signed char b1,
786 signed char b2,
787 signed char b3,
788 signed char b4,
789 signed char b5,
790 signed char b6,
791 signed char b7,
792 signed char b8,
793 signed char b9,
794 signed char b10,
795 signed char b11,
796 signed char b12,
797 signed char b13,
798 signed char b14,
799 signed char b15)
800 {
801 int8_t ALIGN_STRUCT(16)
802 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
803 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
804 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
805 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
806 return (__m128i) vld1q_s8(data);
807 }
808
809 // Sets the 4 signed 32-bit integer values to i.
810 //
811 // r0 := i
812 // r1 := i
813 // r2 := i
// r3 := i
815 //
816 // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi32(int _i)
818 {
819 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
820 }
821
822 // Sets the 2 signed 64-bit integer values to i.
823 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
825 {
826 return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
827 }
828
829 // Sets the 2 signed 64-bit integer values to i.
830 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
832 {
833 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
834 }
835
836 // Sets the 4 signed 32-bit integer values.
837 // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
839 {
840 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
841 return vreinterpretq_m128i_s32(vld1q_s32(data));
842 }
843
844 // Returns the __m128i structure with its two 64-bit integer values
845 // initialized to the values of the two 64-bit integers passed in.
846 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
848 {
849 return vreinterpretq_m128i_s64(
850 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
851 }
852
853 // Returns the __m128i structure with its two 64-bit integer values
854 // initialized to the values of the two 64-bit integers passed in.
855 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
857 {
858 return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
859 }
860
861 // Set packed double-precision (64-bit) floating-point elements in dst with the
862 // supplied values.
863 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
865 {
866 double ALIGN_STRUCT(16) data[2] = {e0, e1};
867 #if defined(__aarch64__)
868 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
869 #else
870 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
871 #endif
872 }
873
874 // Set packed double-precision (64-bit) floating-point elements in dst with the
875 // supplied values in reverse order.
876 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
878 {
879 return _mm_set_pd(e0, e1);
880 }
881
882 // Copy double-precision (64-bit) floating-point element a to the lower element
883 // of dst, and zero the upper element.
884 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
FORCE_INLINE __m128d _mm_set_sd(double a)
886 {
887 return _mm_set_pd(0, a);
888 }
889
890 // Broadcast double-precision (64-bit) floating-point value a to all elements of
891 // dst.
892 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
893 #define _mm_set_pd1 _mm_set1_pd
894
895 // Stores four single-precision, floating-point values.
896 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
898 {
899 vst1q_f32(p, vreinterpretq_f32_m128(a));
900 }
901
902 // Store the lower single-precision (32-bit) floating-point element from a into
903 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
904 // boundary or a general-protection exception may be generated.
905 //
906 // MEM[mem_addr+31:mem_addr] := a[31:0]
907 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
908 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
909 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
910 //
911 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
913 {
914 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
915 vst1q_f32(p, vdupq_n_f32(a0));
916 }
917
918 // Store the lower single-precision (32-bit) floating-point element from a into
919 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
920 // boundary or a general-protection exception may be generated.
921 //
922 // MEM[mem_addr+31:mem_addr] := a[31:0]
923 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
924 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
925 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
926 //
927 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
928 #define _mm_store1_ps _mm_store_ps1
929
930 // Store 4 single-precision (32-bit) floating-point elements from a into memory
931 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
932 // general-protection exception may be generated.
933 //
934 // MEM[mem_addr+31:mem_addr] := a[127:96]
935 // MEM[mem_addr+63:mem_addr+32] := a[95:64]
936 // MEM[mem_addr+95:mem_addr+64] := a[63:32]
937 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
938 //
939 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
941 {
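    /* Reverse the four lanes in two steps: vrev64q_f32 swaps the elements
     * within each 64-bit half, then vextq_f32(..., 2) swaps the two halves. */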
942 float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
943 float32x4_t rev = vextq_f32(tmp, tmp, 2);
944 vst1q_f32(p, rev);
945 }
946
947 // Stores four single-precision, floating-point values.
948 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
950 {
951 vst1q_f32(p, vreinterpretq_f32_m128(a));
952 }
953
// Stores four 32-bit integer values (as a __m128i value) at the address p.
955 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
957 {
958 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
959 }
960
// Stores four 32-bit integer values (as a __m128i value) at the address p.
962 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
964 {
965 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
966 }
967
// Stores the lower single-precision, floating-point value.
969 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
971 {
972 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
973 }
974
975 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
976 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
977 // or a general-protection exception may be generated.
978 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
980 {
981 #if defined(__aarch64__)
982 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
983 #else
984 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
985 #endif
986 }
987
988 // Store the upper double-precision (64-bit) floating-point element from a into
989 // memory.
990 //
991 // MEM[mem_addr+63:mem_addr] := a[127:64]
992 //
993 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
995 {
996 #if defined(__aarch64__)
997 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
998 #else
999 vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
1000 #endif
1001 }
1002
1003 // Store the lower double-precision (64-bit) floating-point element from a into
1004 // memory.
1005 //
1006 // MEM[mem_addr+63:mem_addr] := a[63:0]
1007 //
1008 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
1010 {
1011 #if defined(__aarch64__)
1012 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
1013 #else
1014 vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
1015 #endif
1016 }
1017
1018 // Store 2 double-precision (64-bit) floating-point elements from a into memory
1019 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1020 // general-protection exception may be generated.
1021 //
1022 // MEM[mem_addr+63:mem_addr] := a[127:64]
1023 // MEM[mem_addr+127:mem_addr+64] := a[63:0]
1024 //
1025 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
1027 {
1028 float32x4_t f = vreinterpretq_f32_m128d(a);
1029 _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
1030 }
1031
1032 // Store the lower double-precision (64-bit) floating-point element from a into
1033 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
1034 // boundary or a general-protection exception may be generated.
1035 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
1037 {
1038 #if defined(__aarch64__)
1039 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
1040 vst1q_f64((float64_t *) mem_addr,
1041 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
1042 #else
1043 float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
1044 vst1q_f32((float32_t *) mem_addr,
1045 vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
1046 #endif
1047 }
1048
1049 // Store the lower double-precision (64-bit) floating-point element from a into
1050 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
1051 // boundary or a general-protection exception may be generated.
1052 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
1053 #define _mm_store1_pd _mm_store_pd1
1054
1055 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
1056 // elements) from a into memory. mem_addr does not need to be aligned on any
1057 // particular boundary.
1058 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
1060 {
1061 _mm_store_pd(mem_addr, a);
1062 }
1063
1064 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
1065 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
1067 {
1068 uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
1069 uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
1070 *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
1071 }
1072
1073 // Stores the lower two single-precision floating point values of a to the
1074 // address p.
1075 //
1076 // *p0 := a0
1077 // *p1 := a1
1078 //
1079 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
1081 {
1082 *p = vreinterpret_m64_f32(vget_low_f32(a));
1083 }
1084
1085 // Stores the upper two single-precision, floating-point values of a to the
1086 // address p.
1087 //
1088 // *p0 := a2
1089 // *p1 := a3
1090 //
1091 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
1093 {
1094 *p = vreinterpret_m64_f32(vget_high_f32(a));
1095 }
1096
1097 // Loads a single single-precision, floating-point value, copying it into all
1098 // four words
1099 // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1101 {
1102 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1103 }
1104
1105 // Load a single-precision (32-bit) floating-point element from memory into all
1106 // elements of dst.
1107 //
1108 // dst[31:0] := MEM[mem_addr+31:mem_addr]
1109 // dst[63:32] := MEM[mem_addr+31:mem_addr]
1110 // dst[95:64] := MEM[mem_addr+31:mem_addr]
1111 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1112 //
1113 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
1114 #define _mm_load_ps1 _mm_load1_ps
1115
1116 // Sets the lower two single-precision, floating-point values with 64
1117 // bits of data loaded from the address p; the upper two values are passed
1118 // through from a.
1119 //
1120 // Return Value
1121 // r0 := *p0
1122 // r1 := *p1
1123 // r2 := a2
1124 // r3 := a3
1125 //
1126 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1128 {
1129 return vreinterpretq_m128_f32(
1130 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1131 }
1132
1133 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
1134 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1135 // general-protection exception may be generated.
1136 //
1137 // dst[31:0] := MEM[mem_addr+127:mem_addr+96]
1138 // dst[63:32] := MEM[mem_addr+95:mem_addr+64]
1139 // dst[95:64] := MEM[mem_addr+63:mem_addr+32]
1140 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1141 //
1142 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1144 {
1145 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1146 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1147 }
1148
1149 // Sets the upper two single-precision, floating-point values with 64
1150 // bits of data loaded from the address p; the lower two values are passed
1151 // through from a.
1152 //
1153 // r0 := a0
1154 // r1 := a1
1155 // r2 := *p0
1156 // r3 := *p1
1157 //
1158 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1160 {
1161 return vreinterpretq_m128_f32(
1162 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1163 }
1164
1165 // Loads four single-precision, floating-point values.
1166 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
1168 {
1169 return vreinterpretq_m128_f32(vld1q_f32(p));
1170 }
1171
1172 // Loads four single-precision, floating-point values.
1173 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1175 {
    // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
    // equivalent.
1178 return vreinterpretq_m128_f32(vld1q_f32(p));
1179 }
1180
1181 // Load unaligned 16-bit integer from memory into the first element of dst.
1182 //
1183 // dst[15:0] := MEM[mem_addr+15:mem_addr]
1184 // dst[MAX:16] := 0
1185 //
1186 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1188 {
1189 return vreinterpretq_m128i_s16(
1190 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1191 }
1192
1193 // Load unaligned 64-bit integer from memory into the first element of dst.
1194 //
1195 // dst[63:0] := MEM[mem_addr+63:mem_addr]
1196 // dst[MAX:64] := 0
1197 //
1198 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1200 {
1201 return vreinterpretq_m128i_s64(
1202 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1203 }
1204
1205 // Load a double-precision (64-bit) floating-point element from memory into the
1206 // lower of dst, and zero the upper element. mem_addr does not need to be
1207 // aligned on any particular boundary.
1208 //
1209 // dst[63:0] := MEM[mem_addr+63:mem_addr]
1210 // dst[127:64] := 0
1211 //
1212 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
FORCE_INLINE __m128d _mm_load_sd(const double *p)
1214 {
1215 #if defined(__aarch64__)
1216 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
1217 #else
1218 const float *fp = (const float *) p;
1219 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
1220 return vreinterpretq_m128d_f32(vld1q_f32(data));
1221 #endif
1222 }
1223
// Loads two double-precision, floating-point values from 16-byte aligned
// memory.
1226 //
1227 // dst[127:0] := MEM[mem_addr+127:mem_addr]
1228 //
1229 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
FORCE_INLINE __m128d _mm_load_pd(const double *p)
1231 {
1232 #if defined(__aarch64__)
1233 return vreinterpretq_m128d_f64(vld1q_f64(p));
1234 #else
1235 const float *fp = (const float *) p;
1236 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
1237 return vreinterpretq_m128d_f32(vld1q_f32(data));
1238 #endif
1239 }
1240
// Loads two double-precision, floating-point values from unaligned memory.
1242 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
1244 {
1245 return _mm_load_pd(p);
1246 }
1247
// Loads a single-precision, floating-point value into the low word and
1249 // clears the upper three words.
1250 // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
1252 {
1253 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1254 }
1255
1256 // Load 64-bit integer from memory into the first element of dst.
1257 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
1259 {
1260 /* Load the lower 64 bits of the value pointed to by p into the
1261 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
1262 */
1263 return vreinterpretq_m128i_s32(
1264 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
1265 }
1266
1267 // Load a double-precision (64-bit) floating-point element from memory into the
1268 // lower element of dst, and copy the upper element from a to dst. mem_addr does
1269 // not need to be aligned on any particular boundary.
1270 //
1271 // dst[63:0] := MEM[mem_addr+63:mem_addr]
1272 // dst[127:64] := a[127:64]
1273 //
1274 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
1276 {
1277 #if defined(__aarch64__)
1278 return vreinterpretq_m128d_f64(
1279 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
1280 #else
1281 return vreinterpretq_m128d_f32(
1282 vcombine_f32(vld1_f32((const float *) p),
1283 vget_high_f32(vreinterpretq_f32_m128d(a))));
1284 #endif
1285 }
1286
1287 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
1288 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1289 // general-protection exception may be generated.
1290 //
1291 // dst[63:0] := MEM[mem_addr+127:mem_addr+64]
1292 // dst[127:64] := MEM[mem_addr+63:mem_addr]
1293 //
1294 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
1296 {
1297 #if defined(__aarch64__)
1298 float64x2_t v = vld1q_f64(p);
1299 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
1300 #else
1301 int64x2_t v = vld1q_s64((const int64_t *) p);
1302 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
1303 #endif
1304 }
1305
1306 // Sets the low word to the single-precision, floating-point value of b
1307 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
1309 {
1310 return vreinterpretq_m128_f32(
1311 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
1312 vreinterpretq_f32_m128(a), 0));
1313 }
1314
1315 // Move the lower double-precision (64-bit) floating-point element from b to the
1316 // lower element of dst, and copy the upper element from a to the upper element
1317 // of dst.
1318 //
1319 // dst[63:0] := b[63:0]
1320 // dst[127:64] := a[127:64]
1321 //
1322 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
1324 {
1325 return vreinterpretq_m128d_f32(
1326 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
1327 vget_high_f32(vreinterpretq_f32_m128d(a))));
1328 }
1329
1330 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
1331 // upper element.
1332 //
1333 // dst[63:0] := a[63:0]
1334 // dst[127:64] := 0
1335 //
1336 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
1338 {
1339 return vreinterpretq_m128i_s64(
1340 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
1341 }
1342
1343 // Return vector of type __m128 with undefined elements.
1344 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
1346 {
1347 #if defined(__GNUC__) || defined(__clang__)
1348 #pragma GCC diagnostic push
1349 #pragma GCC diagnostic ignored "-Wuninitialized"
1350 #endif
1351 __m128 a;
1352 return a;
1353 #if defined(__GNUC__) || defined(__clang__)
1354 #pragma GCC diagnostic pop
1355 #endif
1356 }
1357
1358 /* Logic/Binary operations */
1359
1360 // Computes the bitwise AND-NOT of the four single-precision, floating-point
1361 // values of a and b.
1362 //
1363 // r0 := ~a0 & b0
1364 // r1 := ~a1 & b1
1365 // r2 := ~a2 & b2
1366 // r3 := ~a3 & b3
1367 //
1368 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1370 {
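    /* vbicq_s32(x, y) computes x & ~y, so the operands are passed as (b, a)
     * below to realize the SSE semantics (~a) & b. */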
1371 return vreinterpretq_m128_s32(
1372 vbicq_s32(vreinterpretq_s32_m128(b),
1373 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1374 }
1375
1376 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
1377 // elements in a and then AND with b, and store the results in dst.
1378 //
1379 // FOR j := 0 to 1
1380 // i := j*64
1381 // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
1382 // ENDFOR
1383 //
1384 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
1385 FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
1386 {
1387 // *NOTE* argument swap
1388 return vreinterpretq_m128d_s64(
1389 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
1390 }
1391
1392 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
1393 // 128-bit value in a.
1394 //
1395 // r := (~a) & b
1396 //
1397 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
1398 FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
1399 {
1400 return vreinterpretq_m128i_s32(
1401 vbicq_s32(vreinterpretq_s32_m128i(b),
1402 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
1403 }
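
// Example (illustrative): with a = _mm_set1_epi32(0x0F0F0F0F) and
// b = _mm_set1_epi32(-1), _mm_andnot_si128(a, b) leaves 0xF0F0F0F0 in every
// 32-bit lane, i.e. (~a) & b; the operands to vbicq_s32 are swapped because
// NEON's BIC computes first & ~second.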
1404
1405 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
1406 // b.
1407 //
1408 // r := a & b
1409 //
1410 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
1411 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
1412 {
1413 return vreinterpretq_m128i_s32(
1414 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1415 }
1416
1417 // Computes the bitwise AND of the four single-precision, floating-point values
1418 // of a and b.
1419 //
1420 // r0 := a0 & b0
1421 // r1 := a1 & b1
1422 // r2 := a2 & b2
1423 // r3 := a3 & b3
1424 //
1425 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
1426 FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1427 {
1428 return vreinterpretq_m128_s32(
1429 vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1430 }
1431
1432 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
1433 // elements in a and b, and store the results in dst.
1434 //
1435 // FOR j := 0 to 1
1436 // i := j*64
1437 // dst[i+63:i] := a[i+63:i] AND b[i+63:i]
1438 // ENDFOR
1439 //
1440 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
1441 FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
1442 {
1443 return vreinterpretq_m128d_s64(
1444 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1445 }
1446
1447 // Computes the bitwise OR of the four single-precision, floating-point values
1448 // of a and b.
1449 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
1450 FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
1451 {
1452 return vreinterpretq_m128_s32(
1453 vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1454 }
1455
1456 // Computes the bitwise XOR (exclusive OR) of the four single-precision,
1457 // floating-point values of a and b.
1458 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
1459 FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
1460 {
1461 return vreinterpretq_m128_s32(
1462 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1463 }
1464
1465 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
1466 // elements in a and b, and store the results in dst.
1467 //
1468 // FOR j := 0 to 1
1469 // i := j*64
1470 // dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
1471 // ENDFOR
1472 //
1473 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
1474 FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
1475 {
1476 return vreinterpretq_m128d_s64(
1477 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1478 }
1479
1480 // Compute the bitwise OR of packed double-precision (64-bit) floating-point
1481 // elements in a and b, and store the results in dst.
1482 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
1483 FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
1484 {
1485 return vreinterpretq_m128d_s64(
1486 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1487 }
1488
1489 // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
1490 //
1491 // r := a | b
1492 //
1493 // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
1494 FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
1495 {
1496 return vreinterpretq_m128i_s32(
1497 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1498 }
1499
1500 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
1501 // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
1502 FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
1503 {
1504 return vreinterpretq_m128i_s32(
1505 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1506 }
1507
1508 // Duplicate the low double-precision (64-bit) floating-point element from a,
1509 // and store the results in dst.
1510 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
1511 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
1512 {
1513 #if defined(__aarch64__)
1514 return vreinterpretq_m128d_f64(
1515 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
1516 #else
1517 return vreinterpretq_m128d_u64(
1518 vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
1519 #endif
1520 }
1521
1522 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
1523 // from a, and store the results in dst.
1524 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
1525 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
1526 {
1527 #if __has_builtin(__builtin_shufflevector)
1528 return vreinterpretq_m128_f32(__builtin_shufflevector(
1529 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
1530 #else
1531 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
1532 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
1533 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
1534 return vreinterpretq_m128_f32(vld1q_f32(data));
1535 #endif
1536 }
1537
1538 // Duplicate even-indexed single-precision (32-bit) floating-point elements
1539 // from a, and store the results in dst.
1540 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
1541 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
1542 {
1543 #if __has_builtin(__builtin_shufflevector)
1544 return vreinterpretq_m128_f32(__builtin_shufflevector(
1545 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
1546 #else
1547 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1548 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
1549 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
1550 return vreinterpretq_m128_f32(vld1q_f32(data));
1551 #endif
1552 }
1553
1554 // Moves the upper two values of B into the lower two values of A.
1555 //
1556 // r3 := a3
1557 // r2 := a2
1558 // r1 := b3
1559 // r0 := b2
1560 FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1561 {
1562 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
1563 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
1564 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
1565 }
1566
1567 // Moves the lower two values of B into the upper two values of A.
1568 //
1569 // r3 := b1
1570 // r2 := b0
1571 // r1 := a1
1572 // r0 := a0
1573 FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
1574 {
1575 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
1576 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
1577 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1578 }
1579
1580 // Compute the absolute value of packed signed 32-bit integers in a, and store
1581 // the unsigned results in dst.
1582 //
1583 // FOR j := 0 to 3
1584 // i := j*32
1585 // dst[i+31:i] := ABS(a[i+31:i])
1586 // ENDFOR
1587 //
1588 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
1589 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
1590 {
1591 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
1592 }
1593
1594 // Compute the absolute value of packed signed 16-bit integers in a, and store
1595 // the unsigned results in dst.
1596 //
1597 // FOR j := 0 to 7
1598 // i := j*16
1599 // dst[i+15:i] := ABS(a[i+15:i])
1600 // ENDFOR
1601 //
1602 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
1603 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
1604 {
1605 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
1606 }
1607
1608 // Compute the absolute value of packed signed 8-bit integers in a, and store
1609 // the unsigned results in dst.
1610 //
1611 // FOR j := 0 to 15
1612 // i := j*8
1613 // dst[i+7:i] := ABS(a[i+7:i])
1614 // ENDFOR
1615 //
1616 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
1617 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
1618 {
1619 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
1620 }
1621
1622 // Compute the absolute value of packed signed 32-bit integers in a, and store
1623 // the unsigned results in dst.
1624 //
1625 // FOR j := 0 to 1
1626 // i := j*32
1627 // dst[i+31:i] := ABS(a[i+31:i])
1628 // ENDFOR
1629 //
1630 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
1631 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
1632 {
1633 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
1634 }
1635
1636 // Compute the absolute value of packed signed 16-bit integers in a, and store
1637 // the unsigned results in dst.
1638 //
1639 // FOR j := 0 to 3
1640 // i := j*16
1641 // dst[i+15:i] := ABS(a[i+15:i])
1642 // ENDFOR
1643 //
1644 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
1645 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
1646 {
1647 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
1648 }
1649
1650 // Compute the absolute value of packed signed 8-bit integers in a, and store
1651 // the unsigned results in dst.
1652 //
1653 // FOR j := 0 to 7
1654 // i := j*8
1655 // dst[i+7:i] := ABS(a[i+7:i])
1656 // ENDFOR
1657 //
1658 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
1659 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
1660 {
1661 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
1662 }
1663
1664 // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
1665 // the result right by imm8 bytes, and store the low 16 bytes in dst.
1666 //
1667 // tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
1668 // dst[127:0] := tmp[127:0]
1669 //
1670 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
1671 #define _mm_alignr_epi8(a, b, imm) \
1672 __extension__({ \
1673 __m128i ret; \
1674 if (unlikely((imm) >= 32)) { \
1675 ret = _mm_setzero_si128(); \
1676 } else { \
1677 uint8x16_t tmp_low, tmp_high; \
1678 if (imm >= 16) { \
1679 const int idx = imm - 16; \
1680 tmp_low = vreinterpretq_u8_m128i(a); \
1681 tmp_high = vdupq_n_u8(0); \
1682 ret = \
1683 vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
1684 } else { \
1685 const int idx = imm; \
1686 tmp_low = vreinterpretq_u8_m128i(b); \
1687 tmp_high = vreinterpretq_u8_m128i(a); \
1688 ret = \
1689 vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
1690 } \
1691 } \
1692 ret; \
1693 })
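
// Example (illustrative): treating each vector as 16 bytes with lane 0 first,
// _mm_alignr_epi8(a, b, 4) shifts the 32-byte concatenation a:b right by 4
// bytes, giving {b4..b15, a0..a3}; for imm >= 16 the bytes come from a alone
// (shifted right by imm - 16), and for imm >= 32 the result is all zeros.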
1694
1695 // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
1696 // the result right by imm8 bytes, and store the low 8 bytes in dst.
1697 //
1698 // tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
1699 // dst[63:0] := tmp[63:0]
1700 //
1701 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
1702 #define _mm_alignr_pi8(a, b, imm) \
1703 __extension__({ \
1704 __m64 ret; \
1705 if (unlikely((imm) >= 16)) { \
1706 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
1707 } else { \
1708 uint8x8_t tmp_low, tmp_high; \
1709 if (imm >= 8) { \
1710 const int idx = imm - 8; \
1711 tmp_low = vreinterpret_u8_m64(a); \
1712 tmp_high = vdup_n_u8(0); \
1713 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
1714 } else { \
1715 const int idx = imm; \
1716 tmp_low = vreinterpret_u8_m64(b); \
1717 tmp_high = vreinterpret_u8_m64(a); \
1718 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
1719 } \
1720 } \
1721 ret; \
1722 })
1723
1724 // Takes the upper 64 bits of a and places it in the low end of the result
1725 // Takes the lower 64 bits of b and places it into the high end of the result.
1726 FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
1727 {
1728 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1729 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1730 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
1731 }
1732
1733 // takes the lower two 32-bit values from a and swaps them and places them in
1734 // the low end of the result; takes the upper two 32-bit values from b and swaps
1735 // them and places them in the high end of the result.
1736 FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
1737 {
1738 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1739 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
1740 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
1741 }
1742
1743 FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
1744 {
1745 float32x2_t a21 = vget_high_f32(
1746 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1747 float32x2_t b03 = vget_low_f32(
1748 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1749 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
1750 }
1751
1752 FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
1753 {
1754 float32x2_t a03 = vget_low_f32(
1755 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1756 float32x2_t b21 = vget_high_f32(
1757 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1758 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
1759 }
1760
1761 FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
1762 {
1763 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1764 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1765 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1766 }
1767
1768 FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
1769 {
1770 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1771 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1772 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
1773 }
1774
1775 FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
1776 {
1777 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1778 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
1779 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
1780 }
1781
1782 // keeps the low 64 bits of a in the low end of the result and puts the high
1783 // 64 bits of b in the high end
1784 FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
1785 {
1786 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1787 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1788 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
1789 }
1790
1791 FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
1792 {
1793 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
1794 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1795 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
1796 }
1797
1798 FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
1799 {
1800 float32x2_t a22 =
1801 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1802 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1803 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
1804 }
1805
1806 FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
1807 {
1808 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
1809 float32x2_t b22 =
1810 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
1811 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
1812 }
1813
1814 FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
1815 {
1816 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1817 float32x2_t a22 =
1818 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1819 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
1820 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1821 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
1822 }
1823
1824 FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
1825 {
1826 float32x2_t a33 =
1827 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
1828 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
1829 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
1830 }
1831
1832 FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
1833 {
1834 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1835 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1836 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1837 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1838 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
1839 }
1840
1841 FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
1842 {
1843 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1844     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1845 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1846 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1847 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
1848 }
1849
1850 FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
1851 {
1852 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1853     float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1854 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1855 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1856 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
1857 }
1858
1859 // NEON does not support a general purpose permute intrinsic
1860 // Selects four specific single-precision, floating-point values from a and b,
1861 // based on the mask i.
1862 //
1863 // C equivalent:
1864 // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1865 // __constrange(0, 255) int imm) {
1866 // __m128 ret;
1867 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
1868 // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
1869 // return ret;
1870 // }
1871 //
1872 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
1873 #define _mm_shuffle_ps_default(a, b, imm) \
1874 __extension__({ \
1875 float32x4_t ret; \
1876 ret = vmovq_n_f32( \
1877 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
1878 ret = vsetq_lane_f32( \
1879 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1880 ret, 1); \
1881 ret = vsetq_lane_f32( \
1882 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1883 ret, 2); \
1884 ret = vsetq_lane_f32( \
1885 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1886 ret, 3); \
1887 vreinterpretq_m128_f32(ret); \
1888 })
1889
1890 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
1891 // int imm)
1892 #if __has_builtin(__builtin_shufflevector)
1893 #define _mm_shuffle_ps(a, b, imm) \
1894 __extension__({ \
1895 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
1896 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
1897 float32x4_t _shuf = __builtin_shufflevector( \
1898 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1899 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
1900 vreinterpretq_m128_f32(_shuf); \
1901 })
1902 #else // generic
1903 #define _mm_shuffle_ps(a, b, imm) \
1904 __extension__({ \
1905 __m128 ret; \
1906 switch (imm) { \
1907 case _MM_SHUFFLE(1, 0, 3, 2): \
1908 ret = _mm_shuffle_ps_1032((a), (b)); \
1909 break; \
1910 case _MM_SHUFFLE(2, 3, 0, 1): \
1911 ret = _mm_shuffle_ps_2301((a), (b)); \
1912 break; \
1913 case _MM_SHUFFLE(0, 3, 2, 1): \
1914 ret = _mm_shuffle_ps_0321((a), (b)); \
1915 break; \
1916 case _MM_SHUFFLE(2, 1, 0, 3): \
1917 ret = _mm_shuffle_ps_2103((a), (b)); \
1918 break; \
1919 case _MM_SHUFFLE(1, 0, 1, 0): \
1920 ret = _mm_movelh_ps((a), (b)); \
1921 break; \
1922 case _MM_SHUFFLE(1, 0, 0, 1): \
1923 ret = _mm_shuffle_ps_1001((a), (b)); \
1924 break; \
1925 case _MM_SHUFFLE(0, 1, 0, 1): \
1926 ret = _mm_shuffle_ps_0101((a), (b)); \
1927 break; \
1928 case _MM_SHUFFLE(3, 2, 1, 0): \
1929 ret = _mm_shuffle_ps_3210((a), (b)); \
1930 break; \
1931 case _MM_SHUFFLE(0, 0, 1, 1): \
1932 ret = _mm_shuffle_ps_0011((a), (b)); \
1933 break; \
1934 case _MM_SHUFFLE(0, 0, 2, 2): \
1935 ret = _mm_shuffle_ps_0022((a), (b)); \
1936 break; \
1937 case _MM_SHUFFLE(2, 2, 0, 0): \
1938 ret = _mm_shuffle_ps_2200((a), (b)); \
1939 break; \
1940 case _MM_SHUFFLE(3, 2, 0, 2): \
1941 ret = _mm_shuffle_ps_3202((a), (b)); \
1942 break; \
1943 case _MM_SHUFFLE(3, 2, 3, 2): \
1944 ret = _mm_movehl_ps((b), (a)); \
1945 break; \
1946 case _MM_SHUFFLE(1, 1, 3, 3): \
1947 ret = _mm_shuffle_ps_1133((a), (b)); \
1948 break; \
1949 case _MM_SHUFFLE(2, 0, 1, 0): \
1950 ret = _mm_shuffle_ps_2010((a), (b)); \
1951 break; \
1952 case _MM_SHUFFLE(2, 0, 0, 1): \
1953 ret = _mm_shuffle_ps_2001((a), (b)); \
1954 break; \
1955 case _MM_SHUFFLE(2, 0, 3, 2): \
1956 ret = _mm_shuffle_ps_2032((a), (b)); \
1957 break; \
1958 default: \
1959 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
1960 break; \
1961 } \
1962 ret; \
1963 })
1964 #endif
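
// Example (illustrative): _MM_SHUFFLE(z, y, x, w) packs the selectors so that
// dst = { a[w], a[x], b[y], b[z] }.  For instance,
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
// gives r = { a0, a1, b2, b3 }; on the generic path above this case dispatches
// to _mm_shuffle_ps_3210.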
1965
1966 // Takes the upper 64 bits of a and places it in the low end of the result
1967 // Takes the lower 64 bits of a and places it into the high end of the result.
1968 FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
1969 {
1970 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1971 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1972 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1973 }
1974
1975 // takes the lower two 32-bit values from a and swaps them and places in low end
1976 // of result; takes the higher two 32-bit values from a and swaps them and places
1977 // in high end of result.
1978 FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
1979 {
1980 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1981 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1982 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1983 }
1984
1985 // rotates the least significant 32 bits into the most significant 32 bits, and
1986 // shifts the rest down
1987 FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
1988 {
1989 return vreinterpretq_m128i_s32(
1990 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
1991 }
1992
1993 // rotates the most significant 32 bits into the least significant 32 bits, and
1994 // shifts the rest up
1995 FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
1996 {
1997 return vreinterpretq_m128i_s32(
1998 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
1999 }
2000
2001 // gets the lower 64 bits of a, and places it in the upper 64 bits
2002 // gets the lower 64 bits of a and places it in the lower 64 bits
2003 FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
2004 {
2005 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
2006 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
2007 }
2008
2009 // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
2010 // lower 64 bits; gets the lower 64 bits of a, and places it in the upper 64 bits
2011 FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
2012 {
2013 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
2014 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
2015 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
2016 }
2017
2018 // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
2019 // upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements, and
2020 // places it in the lower 64 bits
2021 FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
2022 {
2023 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
2024 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
2025 }
2026
2027 FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
2028 {
2029 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
2030 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
2031 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
2032 }
2033
2034 FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
2035 {
2036 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
2037 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
2038 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
2039 }
2040
2041 FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
2042 {
2043 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
2044 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
2045 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
2046 }
2047
2048 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
2049 // corresponding 8-bit element of b, and store the results in dst.
2050 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
2051 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
2052 {
2053 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
2054 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
2055 uint8x16_t idx_masked =
2056 vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
2057 #if defined(__aarch64__)
2058 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
2059 #elif defined(__GNUC__)
2060 int8x16_t ret;
2061 // %e and %f represent the even and odd D registers
2062 // respectively.
2063 __asm__ __volatile__(
2064 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
2065 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
2066 : [ret] "=&w"(ret)
2067 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
2068 return vreinterpretq_m128i_s8(ret);
2069 #else
2070     // Generic fallback: split the table in two and do two vtbl2 lookups.
2071 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
2072 return vreinterpretq_m128i_s8(
2073 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
2074 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
2075 #endif
2076 }
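
// Example (illustrative): each control byte of b selects a byte of a by its
// low four bits, and forces the result byte to zero when its top bit is set.
// With b = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
// (the identity permutation) the result equals a.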
2077
2078 // C equivalent:
2079 // __m128i _mm_shuffle_epi32_default(__m128i a,
2080 // __constrange(0, 255) int imm) {
2081 // __m128i ret;
2082 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
2083 // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
2084 // return ret;
2085 // }
2086 #define _mm_shuffle_epi32_default(a, imm) \
2087 __extension__({ \
2088 int32x4_t ret; \
2089 ret = vmovq_n_s32( \
2090 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
2091 ret = vsetq_lane_s32( \
2092 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
2093 ret, 1); \
2094 ret = vsetq_lane_s32( \
2095 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
2096 ret, 2); \
2097 ret = vsetq_lane_s32( \
2098 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
2099 ret, 3); \
2100 vreinterpretq_m128i_s32(ret); \
2101 })
2102
2103 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
2104 // int imm)
2105 #if defined(__aarch64__)
2106 #define _mm_shuffle_epi32_splat(a, imm) \
2107 __extension__({ \
2108 vreinterpretq_m128i_s32( \
2109 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
2110 })
2111 #else
2112 #define _mm_shuffle_epi32_splat(a, imm) \
2113 __extension__({ \
2114 vreinterpretq_m128i_s32( \
2115 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
2116 })
2117 #endif
2118
2119 // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
2120 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
2121 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
2122 // __constrange(0,255) int imm)
2123 #if __has_builtin(__builtin_shufflevector)
2124 #define _mm_shuffle_epi32(a, imm) \
2125 __extension__({ \
2126 int32x4_t _input = vreinterpretq_s32_m128i(a); \
2127 int32x4_t _shuf = __builtin_shufflevector( \
2128 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2129 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
2130 vreinterpretq_m128i_s32(_shuf); \
2131 })
2132 #else // generic
2133 #define _mm_shuffle_epi32(a, imm) \
2134 __extension__({ \
2135 __m128i ret; \
2136 switch (imm) { \
2137 case _MM_SHUFFLE(1, 0, 3, 2): \
2138 ret = _mm_shuffle_epi_1032((a)); \
2139 break; \
2140 case _MM_SHUFFLE(2, 3, 0, 1): \
2141 ret = _mm_shuffle_epi_2301((a)); \
2142 break; \
2143 case _MM_SHUFFLE(0, 3, 2, 1): \
2144 ret = _mm_shuffle_epi_0321((a)); \
2145 break; \
2146 case _MM_SHUFFLE(2, 1, 0, 3): \
2147 ret = _mm_shuffle_epi_2103((a)); \
2148 break; \
2149 case _MM_SHUFFLE(1, 0, 1, 0): \
2150 ret = _mm_shuffle_epi_1010((a)); \
2151 break; \
2152 case _MM_SHUFFLE(1, 0, 0, 1): \
2153 ret = _mm_shuffle_epi_1001((a)); \
2154 break; \
2155 case _MM_SHUFFLE(0, 1, 0, 1): \
2156 ret = _mm_shuffle_epi_0101((a)); \
2157 break; \
2158 case _MM_SHUFFLE(2, 2, 1, 1): \
2159 ret = _mm_shuffle_epi_2211((a)); \
2160 break; \
2161 case _MM_SHUFFLE(0, 1, 2, 2): \
2162 ret = _mm_shuffle_epi_0122((a)); \
2163 break; \
2164 case _MM_SHUFFLE(3, 3, 3, 2): \
2165 ret = _mm_shuffle_epi_3332((a)); \
2166 break; \
2167 case _MM_SHUFFLE(0, 0, 0, 0): \
2168 ret = _mm_shuffle_epi32_splat((a), 0); \
2169 break; \
2170 case _MM_SHUFFLE(1, 1, 1, 1): \
2171 ret = _mm_shuffle_epi32_splat((a), 1); \
2172 break; \
2173 case _MM_SHUFFLE(2, 2, 2, 2): \
2174 ret = _mm_shuffle_epi32_splat((a), 2); \
2175 break; \
2176 case _MM_SHUFFLE(3, 3, 3, 3): \
2177 ret = _mm_shuffle_epi32_splat((a), 3); \
2178 break; \
2179 default: \
2180 ret = _mm_shuffle_epi32_default((a), (imm)); \
2181 break; \
2182 } \
2183 ret; \
2184 })
2185 #endif
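
// Example (illustrative): _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3))
// reverses the four 32-bit lanes, while _MM_SHUFFLE(0, 0, 0, 0) broadcasts
// lane 0 to every lane (the splat case handled above).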
2186
2187 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
2188 // by imm.
2189 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
2190 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
2191 // __constrange(0,255) int
2192 // imm)
2193 #define _mm_shufflelo_epi16_function(a, imm) \
2194 __extension__({ \
2195 int16x8_t ret = vreinterpretq_s16_m128i(a); \
2196 int16x4_t lowBits = vget_low_s16(ret); \
2197 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
2198 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
2199 1); \
2200 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
2201 2); \
2202 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
2203 3); \
2204 vreinterpretq_m128i_s16(ret); \
2205 })
2206
2207 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
2208 // __constrange(0,255) int imm)
2209 #if __has_builtin(__builtin_shufflevector)
2210 #define _mm_shufflelo_epi16(a, imm) \
2211 __extension__({ \
2212 int16x8_t _input = vreinterpretq_s16_m128i(a); \
2213 int16x8_t _shuf = __builtin_shufflevector( \
2214 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
2215 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
2216 vreinterpretq_m128i_s16(_shuf); \
2217 })
2218 #else // generic
2219 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
2220 #endif
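
// Example (illustrative): _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3))
// reverses the four low 16-bit lanes and copies the four high lanes through
// unchanged.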
2221
2222 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
2223 // by imm.
2224 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
2225 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
2226 // __constrange(0,255) int
2227 // imm)
2228 #define _mm_shufflehi_epi16_function(a, imm) \
2229 __extension__({ \
2230 int16x8_t ret = vreinterpretq_s16_m128i(a); \
2231 int16x4_t highBits = vget_high_s16(ret); \
2232 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
2233 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
2234 5); \
2235 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
2236 6); \
2237 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
2238 7); \
2239 vreinterpretq_m128i_s16(ret); \
2240 })
2241
2242 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
2243 // __constrange(0,255) int imm)
2244 #if __has_builtin(__builtin_shufflevector)
2245 #define _mm_shufflehi_epi16(a, imm) \
2246 __extension__({ \
2247 int16x8_t _input = vreinterpretq_s16_m128i(a); \
2248 int16x8_t _shuf = __builtin_shufflevector( \
2249 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
2250 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
2251 (((imm) >> 6) & 0x3) + 4); \
2252 vreinterpretq_m128i_s16(_shuf); \
2253 })
2254 #else // generic
2255 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
2256 #endif
2257
2258 // Shuffle double-precision (64-bit) floating-point elements using the control
2259 // in imm8, and store the results in dst.
2260 //
2261 // dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
2262 // dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
2263 //
2264 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
2265 #if __has_builtin(__builtin_shufflevector)
2266 #define _mm_shuffle_pd(a, b, imm8) \
2267 vreinterpretq_m128d_s64(__builtin_shufflevector( \
2268 vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
2269 ((imm8 & 0x2) >> 1) + 2))
2270 #else
2271 #define _mm_shuffle_pd(a, b, imm8) \
2272 _mm_castsi128_pd(_mm_set_epi64x( \
2273 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
2274 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
2275 #endif
2276
2277 // Blend packed 16-bit integers from a and b using control mask imm8, and store
2278 // the results in dst.
2279 //
2280 // FOR j := 0 to 7
2281 // i := j*16
2282 // IF imm8[j]
2283 // dst[i+15:i] := b[i+15:i]
2284 // ELSE
2285 // dst[i+15:i] := a[i+15:i]
2286 // FI
2287 // ENDFOR
2288 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
2289 // __constrange(0,255) int imm)
2290 #define _mm_blend_epi16(a, b, imm) \
2291 __extension__({ \
2292 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \
2293 ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \
2294 ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \
2295 ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \
2296 ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \
2297 ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \
2298 ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \
2299 ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
2300 uint16x8_t _mask_vec = vld1q_u16(_mask); \
2301 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
2302 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
2303 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
2304 })
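
// Example (illustrative): bit j of imm selects 16-bit lane j from b when set
// and from a when clear, so _mm_blend_epi16(a, b, 0x0F) takes the low four
// lanes from b and the high four lanes from a.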
2305
2306 // Blend packed 8-bit integers from a and b using mask, and store the results in
2307 // dst.
2308 //
2309 // FOR j := 0 to 15
2310 // i := j*8
2311 // IF mask[i+7]
2312 // dst[i+7:i] := b[i+7:i]
2313 // ELSE
2314 // dst[i+7:i] := a[i+7:i]
2315 // FI
2316 // ENDFOR
2317 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
2318 {
2319 // Use a signed shift right to create a mask with the sign bit
2320 uint8x16_t mask =
2321 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
2322 uint8x16_t a = vreinterpretq_u8_m128i(_a);
2323 uint8x16_t b = vreinterpretq_u8_m128i(_b);
2324 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
2325 }
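
// Example (illustrative): only the most significant bit of each mask byte is
// observed; a mask byte with that bit set selects the byte from b, otherwise
// from a.  The arithmetic shift above widens the sign bit into a full
// 0x00/0xFF byte so vbslq_u8 can do the per-byte select.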
2326
2327 /* Shifts */
2328
2329
2330 // Shift packed 16-bit integers in a right by imm while shifting in sign
2331 // bits, and store the results in dst.
2332 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
2333 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
2334 {
2335 const int count = (imm & ~15) ? 15 : imm;
2336 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
2337 }
2338
2339 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2340 // shifting in zeros.
2341 //
2342 // r0 := a0 << count
2343 // r1 := a1 << count
2344 // ...
2345 // r7 := a7 << count
2346 //
2347 // https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
2348 #define _mm_slli_epi16(a, imm) \
2349 __extension__({ \
2350 __m128i ret; \
2351         if (unlikely((imm) <= 0)) {                                      \
2352             ret = a;                                                     \
2353         } else if (unlikely((imm) > 15)) {                               \
2354             ret = _mm_setzero_si128();                                   \
2355         } else {                                                         \
2357 ret = vreinterpretq_m128i_s16( \
2358 vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
2359 } \
2360 ret; \
2361 })
2362
2363 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2364 // shifting in zeros.
2365 // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
2366 // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
2367 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
2368 {
2369 if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
2370 return a;
2371 if (unlikely(imm > 31))
2372 return _mm_setzero_si128();
2373 return vreinterpretq_m128i_s32(
2374 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
2375 }
2376
2377 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
2378 // store the results in dst.
2379 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
2380 {
2381 if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
2382 return a;
2383 if (unlikely(imm > 63))
2384 return _mm_setzero_si128();
2385 return vreinterpretq_m128i_s64(
2386 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
2387 }
2388
2389 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
2390 // store the results in dst.
2391 //
2392 // FOR j := 0 to 7
2393 // i := j*16
2394 // IF imm8[7:0] > 15
2395 // dst[i+15:i] := 0
2396 // ELSE
2397 // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
2398 // FI
2399 // ENDFOR
2400 //
2401 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
2402 #define _mm_srli_epi16(a, imm) \
2403 __extension__({ \
2404 __m128i ret; \
2405         if (unlikely((imm) == 0)) {                                      \
2406             ret = a;                                                     \
2407         } else if (likely(0 < (imm) && (imm) < 16)) {                    \
2409 ret = vreinterpretq_m128i_u16( \
2410 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
2411 } else { \
2412 ret = _mm_setzero_si128(); \
2413 } \
2414 ret; \
2415 })
2416
2417 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
2418 // store the results in dst.
2419 //
2420 // FOR j := 0 to 3
2421 // i := j*32
2422 // IF imm8[7:0] > 31
2423 // dst[i+31:i] := 0
2424 // ELSE
2425 // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
2426 // FI
2427 // ENDFOR
2428 //
2429 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
2430 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
2431 #define _mm_srli_epi32(a, imm) \
2432 __extension__({ \
2433 __m128i ret; \
2434 if (unlikely((imm) == 0)) { \
2435 ret = a; \
2436         } else if (likely(0 < (imm) && (imm) < 32)) {                    \
2438 ret = vreinterpretq_m128i_u32( \
2439 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
2440 } else { \
2441 ret = _mm_setzero_si128(); \
2442 } \
2443 ret; \
2444 })
2445
2446 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
2447 // store the results in dst.
2448 //
2449 // FOR j := 0 to 1
2450 // i := j*64
2451 // IF imm8[7:0] > 63
2452 // dst[i+63:i] := 0
2453 // ELSE
2454 // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
2455 // FI
2456 // ENDFOR
2457 //
2458 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
2459 #define _mm_srli_epi64(a, imm) \
2460 __extension__({ \
2461 __m128i ret; \
2462 if (unlikely((imm) == 0)) { \
2463 ret = a; \
2464         } else if (likely(0 < (imm) && (imm) < 64)) {                    \
2466 ret = vreinterpretq_m128i_u64( \
2467 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
2468 } else { \
2469 ret = _mm_setzero_si128(); \
2470 } \
2471 ret; \
2472 })
2473
2474 // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
2475 // and store the results in dst.
2476 //
2477 // FOR j := 0 to 3
2478 // i := j*32
2479 // IF imm8[7:0] > 31
2480 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
2481 // ELSE
2482 // dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
2483 // FI
2484 // ENDFOR
2485 //
2486 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
2487 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
2488 #define _mm_srai_epi32(a, imm) \
2489 __extension__({ \
2490 __m128i ret; \
2491 if (unlikely((imm) == 0)) { \
2492 ret = a; \
2493         } else if (likely(0 < (imm) && (imm) < 32)) {                    \
2495 ret = vreinterpretq_m128i_s32( \
2496 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
2497 } else { \
2498 ret = vreinterpretq_m128i_s32( \
2499 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
2500 } \
2501 ret; \
2502 })
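
// Example (illustrative): for a 32-bit lane holding -8 (0xFFFFFFF8),
// _mm_srai_epi32(a, 2) yields -2 (sign bits shifted in), whereas
// _mm_srli_epi32(a, 2) yields 0x3FFFFFFE (zeros shifted in).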
2503
2504 // Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
2505 // imm must be an immediate.
2506 //
2507 // r := srl(a, imm*8)
2508 //
2509 // https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
2510 // FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
2511 #define _mm_srli_si128(a, imm) \
2512 __extension__({ \
2513 __m128i ret; \
2514 if (unlikely((imm) <= 0)) { \
2515 ret = a; \
2516         } else if (unlikely((imm) > 15)) {                               \
2518 ret = _mm_setzero_si128(); \
2519 } else { \
2520 ret = vreinterpretq_m128i_s8( \
2521 vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
2522 } \
2523 ret; \
2524 })
2525
2526 // Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
2527 // must be an immediate.
2528 //
2529 // r := a << (imm * 8)
2530 //
2531 // https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
2532 // FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
2533 #define _mm_slli_si128(a, imm) \
2534 __extension__({ \
2535 __m128i ret; \
2536 if (unlikely((imm) <= 0)) { \
2537 ret = a; \
2538         } else if (unlikely((imm) > 15)) {                               \
2540 ret = _mm_setzero_si128(); \
2541 } else { \
2542 ret = vreinterpretq_m128i_s8(vextq_s8( \
2543 vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
2544 } \
2545 ret; \
2546 })
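
// Example (illustrative): _mm_slli_si128(a, 4) shifts the whole register left
// by four bytes, so result byte lane i holds a's byte lane i - 4 and lanes
// 0..3 become zero; _mm_srli_si128(a, 4) is the mirror image.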
2547
2548 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2549 // shifting in zeros.
2550 //
2551 // r0 := a0 << count
2552 // r1 := a1 << count
2553 // ...
2554 // r7 := a7 << count
2555 //
2556 // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
2557 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
2558 {
2559 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2560 if (unlikely(c > 15))
2561 return _mm_setzero_si128();
2562
2563 int16x8_t vc = vdupq_n_s16((int16_t) c);
2564 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
2565 }
2566
2567 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2568 // shifting in zeros.
2569 //
2570 // r0 := a0 << count
2571 // r1 := a1 << count
2572 // r2 := a2 << count
2573 // r3 := a3 << count
2574 //
2575 // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
2576 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
2577 {
2578 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2579 if (unlikely(c > 31))
2580 return _mm_setzero_si128();
2581
2582 int32x4_t vc = vdupq_n_s32((int32_t) c);
2583 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
2584 }
2585
2586 // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
2587 // shifting in zeros.
2588 //
2589 // r0 := a0 << count
2590 // r1 := a1 << count
2591 //
2592 // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
2593 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
2594 {
2595 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2596 if (unlikely(c > 63))
2597 return _mm_setzero_si128();
2598
2599 int64x2_t vc = vdupq_n_s64((int64_t) c);
2600 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
2601 }
2602
2603 // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
2604 // while shifting in zeros.
2605 //
2606 // r0 := srl(a0, count)
2607 // r1 := srl(a1, count)
2608 // ...
2609 // r7 := srl(a7, count)
2610 //
2611 // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
2612 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
2613 {
2614 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2615 if (unlikely(c > 15))
2616 return _mm_setzero_si128();
2617
2618 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
2619 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
2620 }
2621
2622 // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
2623 // while shifting in zeros.
2624 //
2625 // r0 := srl(a0, count)
2626 // r1 := srl(a1, count)
2627 // r2 := srl(a2, count)
2628 // r3 := srl(a3, count)
2629 //
2630 // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
2631 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
2632 {
2633 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2634 if (unlikely(c > 31))
2635 return _mm_setzero_si128();
2636
2637 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
2638 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
2639 }
2640
2641 // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
2642 // while shifting in zeros.
2643 //
2644 // r0 := srl(a0, count)
2645 // r1 := srl(a1, count)
2646 //
2647 // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
2648 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
2649 {
2650 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2651 if (unlikely(c > 63))
2652 return _mm_setzero_si128();
2653
2654 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
2655 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
2656 }
2657
2658 // NEON does not provide a version of this function.
2659 // Creates a 16-bit mask from the most significant bits of the 16 signed or
2660 // unsigned 8-bit integers in a and zero extends the upper bits.
2661 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
2662 FORCE_INLINE int _mm_movemask_epi8(__m128i a)
2663 {
2664 // Use increasingly wide shifts+adds to collect the sign bits
2665 // together.
2666 // Since the widening shifts would be rather confusing to follow in little
2667 // endian, everything will be illustrated in big endian order instead. This
2668 // has a different result - the bits would actually be reversed on a big
2669 // endian machine.
2670
2671 // Starting input (only half the elements are shown):
2672 // 89 ff 1d c0 00 10 99 33
2673 uint8x16_t input = vreinterpretq_u8_m128i(a);
2674
2675 // Shift out everything but the sign bits with an unsigned shift right.
2676 //
2677 // Bytes of the vector::
2678 // 89 ff 1d c0 00 10 99 33
2679 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
2680 // | | | | | | | |
2681 // 01 01 00 01 00 00 01 00
2682 //
2683 // Bits of first important lane(s):
2684 // 10001001 (89)
2685 // \______
2686 // |
2687 // 00000001 (01)
2688 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
2689
2690 // Merge the even lanes together with a 16-bit unsigned shift right + add.
2691 // 'xx' represents garbage data which will be ignored in the final result.
2692 // In the important bytes, the add functions like a binary OR.
2693 //
2694 // 01 01 00 01 00 00 01 00
2695 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
2696 // \| \| \| \|
2697 // xx 03 xx 01 xx 00 xx 02
2698 //
2699 // 00000001 00000001 (01 01)
2700 // \_______ |
2701 // \|
2702 // xxxxxxxx xxxxxx11 (xx 03)
2703 uint32x4_t paired16 =
2704 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2705
2706 // Repeat with a wider 32-bit shift + add.
2707 // xx 03 xx 01 xx 00 xx 02
2708 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
2709 // 14))
2710 // \| \|
2711 // xx xx xx 0d xx xx xx 02
2712 //
2713 // 00000011 00000001 (03 01)
2714 // \\_____ ||
2715 // '----.\||
2716 // xxxxxxxx xxxx1101 (xx 0d)
2717 uint64x2_t paired32 =
2718 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2719
2720 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
2721 // lanes. xx xx xx 0d xx xx xx 02
2722 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
2723 // 28))
2724 // \|
2725 // xx xx xx xx xx xx xx d2
2726 //
2727 // 00001101 00000010 (0d 02)
2728 // \ \___ | |
2729 // '---. \| |
2730 // xxxxxxxx 11010010 (xx d2)
2731 uint8x16_t paired64 =
2732 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2733
2734 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
2735 // xx xx xx xx xx xx xx d2
2736 // || return paired64[0]
2737 // d2
2738 // Note: Little endian would return the correct value 4b (01001011) instead.
2739 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
2740 }
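
// Example (illustrative): if every byte of a has its most significant bit set,
// _mm_movemask_epi8(a) returns 0xFFFF; if no byte does, it returns 0.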
2741
2742 // Copy the lower 64-bit integer in a to dst.
2743 //
2744 // dst[63:0] := a[63:0]
2745 //
2746 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
2747 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
2748 {
2749 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
2750 }
2751
2752 // Copy the 64-bit integer a to the lower element of dst, and zero the upper
2753 // element.
2754 //
2755 // dst[63:0] := a[63:0]
2756 // dst[127:64] := 0
2757 //
2758 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
2759 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
2760 {
2761 return vreinterpretq_m128i_s64(
2762 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
2763 }
2764
2765 // NEON does not provide this method
2766 // Creates a 4-bit mask from the most significant bits of the four
2767 // single-precision, floating-point values.
2768 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2769 FORCE_INLINE int _mm_movemask_ps(__m128 a)
2770 {
2771 uint32x4_t input = vreinterpretq_u32_m128(a);
2772 #if defined(__aarch64__)
2773 static const int32x4_t shift = {0, 1, 2, 3};
2774 uint32x4_t tmp = vshrq_n_u32(input, 31);
2775 return vaddvq_u32(vshlq_u32(tmp, shift));
2776 #else
2777 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2778 // Shift out everything but the sign bits with a 32-bit unsigned shift
2779 // right.
2780 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2781 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2782 uint8x16_t paired =
2783 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2784 // Extract the result.
2785 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2786 #endif
2787 }
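
// Example (illustrative): with a = _mm_set_ps(4.0f, -3.0f, 2.0f, -1.0f), lanes
// 0 and 2 are negative, so _mm_movemask_ps(a) returns 0x5 (binary 0101).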
2788
2789 // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
2790 // all 1's, and return 1 if the result is zero, otherwise return 0.
2791 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
2792 FORCE_INLINE int _mm_test_all_ones(__m128i a)
2793 {
2794 return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
2795 ~(uint64_t) 0;
2796 }
2797
2798 // Compute the bitwise AND of 128 bits (representing integer data) in a and
2799 // mask, and return 1 if the result is zero, otherwise return 0.
2800 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
2801 FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
2802 {
2803 int64x2_t a_and_mask =
2804 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
2805 return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
2806 : 1;
2807 }
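
// Example (illustrative): _mm_test_all_zeros(a, mask) returns 1 exactly when
// (a & mask) has no bits set, so any vector tested against
// _mm_setzero_si128() as the mask reports 1.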
2808
2809 /* Math operations */
2810
2811 // Subtracts the four single-precision, floating-point values of a and b.
2812 //
2813 // r0 := a0 - b0
2814 // r1 := a1 - b1
2815 // r2 := a2 - b2
2816 // r3 := a3 - b3
2817 //
2818 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2819 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2820 {
2821 return vreinterpretq_m128_f32(
2822 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2823 }
2824
2825 // Subtract the lower single-precision (32-bit) floating-point element in b from
2826 // the lower single-precision (32-bit) floating-point element in a, store the
2827 // result in the lower element of dst, and copy the upper 3 packed elements from
2828 // a to the upper elements of dst.
2829 //
2830 // dst[31:0] := a[31:0] - b[31:0]
2831 // dst[127:32] := a[127:32]
2832 //
2833 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2834 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2835 {
2836 return _mm_move_ss(a, _mm_sub_ps(a, b));
2837 }
2838
2839 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
2840 // and store the results in dst.
2841 // r0 := a0 - b0
2842 // r1 := a1 - b1
2843 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
2844 {
2845 return vreinterpretq_m128i_s64(
2846 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2847 }
2848
2849 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
2850 // unsigned 32-bit integers of a.
2851 //
2852 // r0 := a0 - b0
2853 // r1 := a1 - b1
2854 // r2 := a2 - b2
2855 // r3 := a3 - b3
2856 //
2857 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
2858 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
2859 {
2860 return vreinterpretq_m128i_s32(
2861 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2862 }
2863
2864 // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
2865 // store the results in dst.
2866 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
2867 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
2868 {
2869 return vreinterpretq_m128i_s16(
2870 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2871 }
2872
2873 // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
2874 // store the results in dst.
2875 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
2877 {
2878 return vreinterpretq_m128i_s8(
2879 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2880 }
2881
2882 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
2883 //
2884 // dst[63:0] := a[63:0] - b[63:0]
2885 //
2886 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
2888 {
2889 return vreinterpret_m64_s64(
2890 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2891 }
2892
// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
// integers of a and saturates.
// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
2897 {
2898 return vreinterpretq_m128i_u16(
2899 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2900 }
2901
2902 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
2903 // integers of a and saturates.
2904 //
2905 // r0 := UnsignedSaturate(a0 - b0)
2906 // r1 := UnsignedSaturate(a1 - b1)
2907 // ...
2908 // r15 := UnsignedSaturate(a15 - b15)
2909 //
2910 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
2912 {
2913 return vreinterpretq_m128i_u8(
2914 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2915 }
2916
2917 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
2918 // of a and saturates.
2919 //
2920 // r0 := SignedSaturate(a0 - b0)
2921 // r1 := SignedSaturate(a1 - b1)
2922 // ...
2923 // r15 := SignedSaturate(a15 - b15)
2924 //
2925 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
2927 {
2928 return vreinterpretq_m128i_s8(
2929 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2930 }
2931
2932 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
2933 // of a and saturates.
2934 //
2935 // r0 := SignedSaturate(a0 - b0)
2936 // r1 := SignedSaturate(a1 - b1)
2937 // ...
2938 // r7 := SignedSaturate(a7 - b7)
2939 //
2940 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2942 {
2943 return vreinterpretq_m128i_s16(
2944 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2945 }
2946
2947 // Subtract packed double-precision (64-bit) floating-point elements in b from
2948 // packed double-precision (64-bit) floating-point elements in a, and store the
2949 // results in dst.
2950 //
2951 // FOR j := 0 to 1
2952 // i := j*64
2953 // dst[i+63:i] := a[i+63:i] - b[i+63:i]
2954 // ENDFOR
2955 //
2956 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
2958 {
2959 #if defined(__aarch64__)
2960 return vreinterpretq_m128d_f64(
2961 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2962 #else
2963 double *da = (double *) &a;
2964 double *db = (double *) &b;
2965 double c[2];
2966 c[0] = da[0] - db[0];
2967 c[1] = da[1] - db[1];
2968 return vld1q_f32((float32_t *) c);
2969 #endif
2970 }
2971
2972 // Subtract the lower double-precision (64-bit) floating-point element in b from
2973 // the lower double-precision (64-bit) floating-point element in a, store the
2974 // result in the lower element of dst, and copy the upper element from a to the
2975 // upper element of dst.
2976 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
2978 {
2979 return _mm_move_sd(a, _mm_sub_pd(a, b));
2980 }
2981
2982 // Add packed unsigned 16-bit integers in a and b using saturation, and store
2983 // the results in dst.
2984 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2986 {
2987 return vreinterpretq_m128i_u16(
2988 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2989 }
2990
2991 // Negate packed 8-bit integers in a when the corresponding signed
2992 // 8-bit integer in b is negative, and store the results in dst.
2993 // Element in dst are zeroed out when the corresponding element
2994 // in b is zero.
2995 //
2996 // for i in 0..15
2997 // if b[i] < 0
2998 // r[i] := -a[i]
2999 // else if b[i] == 0
3000 // r[i] := 0
3001 // else
3002 // r[i] := a[i]
3003 // fi
3004 // done
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
3006 {
3007 int8x16_t a = vreinterpretq_s8_m128i(_a);
3008 int8x16_t b = vreinterpretq_s8_m128i(_b);
3009
3010 // signed shift right: faster than vclt
3011 // (b < 0) ? 0xFF : 0
3012 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
3013
3014 // (b == 0) ? 0xFF : 0
3015 #if defined(__aarch64__)
3016 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
3017 #else
3018 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
3019 #endif
3020
    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative 'a')
    // based on ltMask
3023 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
3024 // res = masked & (~zeroMask)
3025 int8x16_t res = vbicq_s8(masked, zeroMask);
3026
3027 return vreinterpretq_m128i_s8(res);
3028 }
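
// Scalar model of one 8-bit lane of the operation above (illustrative sketch
// only; ref_sign_s8 is a hypothetical name and <stdint.h> is assumed):
//
//   static inline int8_t ref_sign_s8(int8_t a, int8_t b)
//   {
//       if (b < 0)
//           return (int8_t) -a;
//       return (b == 0) ? 0 : a;
//   }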
3029
3030 // Negate packed 16-bit integers in a when the corresponding signed
3031 // 16-bit integer in b is negative, and store the results in dst.
3032 // Element in dst are zeroed out when the corresponding element
3033 // in b is zero.
3034 //
3035 // for i in 0..7
3036 // if b[i] < 0
3037 // r[i] := -a[i]
3038 // else if b[i] == 0
3039 // r[i] := 0
3040 // else
3041 // r[i] := a[i]
3042 // fi
3043 // done
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
3045 {
3046 int16x8_t a = vreinterpretq_s16_m128i(_a);
3047 int16x8_t b = vreinterpretq_s16_m128i(_b);
3048
3049 // signed shift right: faster than vclt
3050 // (b < 0) ? 0xFFFF : 0
3051 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
3052 // (b == 0) ? 0xFFFF : 0
3053 #if defined(__aarch64__)
3054 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
3055 #else
3056 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
3057 #endif
3058
3059 // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
3060 // 'a') based on ltMask
3061 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
3062 // res = masked & (~zeroMask)
3063 int16x8_t res = vbicq_s16(masked, zeroMask);
3064 return vreinterpretq_m128i_s16(res);
3065 }
3066
3067 // Negate packed 32-bit integers in a when the corresponding signed
3068 // 32-bit integer in b is negative, and store the results in dst.
3069 // Element in dst are zeroed out when the corresponding element
3070 // in b is zero.
3071 //
3072 // for i in 0..3
3073 // if b[i] < 0
3074 // r[i] := -a[i]
3075 // else if b[i] == 0
3076 // r[i] := 0
3077 // else
3078 // r[i] := a[i]
3079 // fi
3080 // done
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
3082 {
3083 int32x4_t a = vreinterpretq_s32_m128i(_a);
3084 int32x4_t b = vreinterpretq_s32_m128i(_b);
3085
3086 // signed shift right: faster than vclt
3087 // (b < 0) ? 0xFFFFFFFF : 0
3088 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
3089
3090 // (b == 0) ? 0xFFFFFFFF : 0
3091 #if defined(__aarch64__)
3092 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
3093 #else
3094 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
3095 #endif
3096
3097 // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
3098 // 'a') based on ltMask
3099 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
3100 // res = masked & (~zeroMask)
3101 int32x4_t res = vbicq_s32(masked, zeroMask);
3102 return vreinterpretq_m128i_s32(res);
3103 }
3104
3105 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
3106 // integer in b is negative, and store the results in dst. Element in dst are
3107 // zeroed out when the corresponding element in b is zero.
3108 //
3109 // FOR j := 0 to 3
3110 // i := j*16
3111 // IF b[i+15:i] < 0
3112 // dst[i+15:i] := -(a[i+15:i])
3113 // ELSE IF b[i+15:i] == 0
3114 // dst[i+15:i] := 0
3115 // ELSE
3116 // dst[i+15:i] := a[i+15:i]
3117 // FI
3118 // ENDFOR
3119 //
3120 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
3122 {
3123 int16x4_t a = vreinterpret_s16_m64(_a);
3124 int16x4_t b = vreinterpret_s16_m64(_b);
3125
3126 // signed shift right: faster than vclt
3127 // (b < 0) ? 0xFFFF : 0
3128 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
3129
3130 // (b == 0) ? 0xFFFF : 0
3131 #if defined(__aarch64__)
3132 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
3133 #else
3134 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
3135 #endif
3136
    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative 'a')
    // based on ltMask
3139 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
3140 // res = masked & (~zeroMask)
3141 int16x4_t res = vbic_s16(masked, zeroMask);
3142
3143 return vreinterpret_m64_s16(res);
3144 }
3145
3146 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
3147 // integer in b is negative, and store the results in dst. Element in dst are
3148 // zeroed out when the corresponding element in b is zero.
3149 //
3150 // FOR j := 0 to 1
3151 // i := j*32
3152 // IF b[i+31:i] < 0
3153 // dst[i+31:i] := -(a[i+31:i])
3154 // ELSE IF b[i+31:i] == 0
3155 // dst[i+31:i] := 0
3156 // ELSE
3157 // dst[i+31:i] := a[i+31:i]
3158 // FI
3159 // ENDFOR
3160 //
3161 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
3163 {
3164 int32x2_t a = vreinterpret_s32_m64(_a);
3165 int32x2_t b = vreinterpret_s32_m64(_b);
3166
3167 // signed shift right: faster than vclt
3168 // (b < 0) ? 0xFFFFFFFF : 0
3169 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
3170
3171 // (b == 0) ? 0xFFFFFFFF : 0
3172 #if defined(__aarch64__)
3173 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
3174 #else
3175 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
3176 #endif
3177
    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative 'a')
    // based on ltMask
3180 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
3181 // res = masked & (~zeroMask)
3182 int32x2_t res = vbic_s32(masked, zeroMask);
3183
3184 return vreinterpret_m64_s32(res);
3185 }
3186
3187 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
3188 // in b is negative, and store the results in dst. Element in dst are zeroed out
3189 // when the corresponding element in b is zero.
3190 //
3191 // FOR j := 0 to 7
3192 // i := j*8
3193 // IF b[i+7:i] < 0
3194 // dst[i+7:i] := -(a[i+7:i])
3195 // ELSE IF b[i+7:i] == 0
3196 // dst[i+7:i] := 0
3197 // ELSE
3198 // dst[i+7:i] := a[i+7:i]
3199 // FI
3200 // ENDFOR
3201 //
3202 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
3204 {
3205 int8x8_t a = vreinterpret_s8_m64(_a);
3206 int8x8_t b = vreinterpret_s8_m64(_b);
3207
3208 // signed shift right: faster than vclt
3209 // (b < 0) ? 0xFF : 0
3210 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
3211
3212 // (b == 0) ? 0xFF : 0
3213 #if defined(__aarch64__)
3214 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
3215 #else
3216 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
3217 #endif
3218
    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative 'a')
    // based on ltMask
3221 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
3222 // res = masked & (~zeroMask)
3223 int8x8_t res = vbic_s8(masked, zeroMask);
3224
3225 return vreinterpret_m64_s8(res);
3226 }
3227
3228 // Average packed unsigned 16-bit integers in a and b, and store the results in
3229 // dst.
3230 //
3231 // FOR j := 0 to 3
3232 // i := j*16
3233 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
3234 // ENDFOR
3235 //
3236 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
3238 {
3239 return vreinterpret_m64_u16(
3240 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
3241 }
3242
3243 // Average packed unsigned 8-bit integers in a and b, and store the results in
3244 // dst.
3245 //
3246 // FOR j := 0 to 7
3247 // i := j*8
3248 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
3249 // ENDFOR
3250 //
3251 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
3253 {
3254 return vreinterpret_m64_u8(
3255 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3256 }
3257
3258 // Average packed unsigned 8-bit integers in a and b, and store the results in
3259 // dst.
3260 //
3261 // FOR j := 0 to 7
3262 // i := j*8
3263 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
3264 // ENDFOR
3265 //
3266 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
3267 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
3268
3269 // Average packed unsigned 16-bit integers in a and b, and store the results in
3270 // dst.
3271 //
3272 // FOR j := 0 to 3
3273 // i := j*16
3274 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
3275 // ENDFOR
3276 //
3277 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
3278 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
3279
3280 // Extract a 16-bit integer from a, selected with imm8, and store the result in
3281 // the lower element of dst.
3282 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
3283 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
3284
3285 // Copy a to dst, and insert the 16-bit integer i into dst at the location
3286 // specified by imm8.
3287 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
3288 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
3289
3290 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3291 // values in dst.
3292 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
3293 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
3294
3295 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3296 // values in dst.
3297 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
3298 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
3299
3300 // Compare packed signed 16-bit integers in a and b, and store packed minimum
3301 // values in dst.
3302 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
3303 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
3304
3305 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3306 // values in dst.
3307 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
3308 #define _m_pminub(a, b) _mm_min_pu8(a, b)
3309
3310 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
3311 // unsigned 8-bit integers in b and rounds.
3312 //
3313 // r0 := (a0 + b0) / 2
3314 // r1 := (a1 + b1) / 2
3315 // ...
3316 // r15 := (a15 + b15) / 2
3317 //
3318 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3320 {
3321 return vreinterpretq_m128i_u8(
3322 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3323 }
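
// Scalar model of one lane of the rounding average above (illustrative; note
// the +1 before the shift, matching vrhaddq_u8 and x86 PAVGB; <stdint.h> is
// assumed):
//
//   static inline uint8_t ref_avg_u8(uint8_t a, uint8_t b)
//   {
//       return (uint8_t) (((unsigned) a + (unsigned) b + 1) >> 1);
//   }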
3324
3325 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
3326 // unsigned 16-bit integers in b and rounds.
3327 //
3328 // r0 := (a0 + b0) / 2
3329 // r1 := (a1 + b1) / 2
3330 // ...
3331 // r7 := (a7 + b7) / 2
3332 //
3333 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3335 {
3336 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3337 vreinterpretq_u16_m128i(b));
3338 }
3339
3340 // Adds the four single-precision, floating-point values of a and b.
3341 //
3342 // r0 := a0 + b0
3343 // r1 := a1 + b1
3344 // r2 := a2 + b2
3345 // r3 := a3 + b3
3346 //
3347 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
3349 {
3350 return vreinterpretq_m128_f32(
3351 vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3352 }
3353
3354 // Add packed double-precision (64-bit) floating-point elements in a and b, and
3355 // store the results in dst.
3356 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
3358 {
3359 #if defined(__aarch64__)
3360 return vreinterpretq_m128d_f64(
3361 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3362 #else
3363 double *da = (double *) &a;
3364 double *db = (double *) &b;
3365 double c[2];
3366 c[0] = da[0] + db[0];
3367 c[1] = da[1] + db[1];
3368 return vld1q_f32((float32_t *) c);
3369 #endif
3370 }
3371
3372 // Add the lower double-precision (64-bit) floating-point element in a and b,
3373 // store the result in the lower element of dst, and copy the upper element from
3374 // a to the upper element of dst.
3375 //
3376 // dst[63:0] := a[63:0] + b[63:0]
3377 // dst[127:64] := a[127:64]
3378 //
3379 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
3381 {
3382 #if defined(__aarch64__)
3383 return _mm_move_sd(a, _mm_add_pd(a, b));
3384 #else
3385 double *da = (double *) &a;
3386 double *db = (double *) &b;
3387 double c[2];
3388 c[0] = da[0] + db[0];
3389 c[1] = da[1];
3390 return vld1q_f32((float32_t *) c);
3391 #endif
3392 }
3393
3394 // Add 64-bit integers a and b, and store the result in dst.
3395 //
3396 // dst[63:0] := a[63:0] + b[63:0]
3397 //
3398 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
3400 {
3401 return vreinterpret_m64_s64(
3402 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
3403 }
3404
// Adds the scalar single-precision floating point values of a and b.
// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
3408 {
3409 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
3410 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
3411 // the upper values in the result must be the remnants of <a>.
3412 return vreinterpretq_m128_f32(vaddq_f32(a, value));
3413 }
3414
// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
3419 {
3420 return vreinterpretq_m128i_s64(
3421 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
3422 }
3423
3424 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
3425 // unsigned 32-bit integers in b.
3426 //
3427 // r0 := a0 + b0
3428 // r1 := a1 + b1
3429 // r2 := a2 + b2
3430 // r3 := a3 + b3
3431 //
3432 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
3434 {
3435 return vreinterpretq_m128i_s32(
3436 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3437 }
3438
3439 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
3440 // unsigned 16-bit integers in b.
3441 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
3443 {
3444 return vreinterpretq_m128i_s16(
3445 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3446 }
3447
3448 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
3449 // unsigned 8-bit integers in b.
3450 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
3452 {
3453 return vreinterpretq_m128i_s8(
3454 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3455 }
3456
3457 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
3458 // and saturates.
3459 //
3460 // r0 := SignedSaturate(a0 + b0)
3461 // r1 := SignedSaturate(a1 + b1)
3462 // ...
3463 // r7 := SignedSaturate(a7 + b7)
3464 //
3465 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3467 {
3468 return vreinterpretq_m128i_s16(
3469 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3470 }
3471
3472 // Add packed signed 8-bit integers in a and b using saturation, and store the
3473 // results in dst.
3474 //
3475 // FOR j := 0 to 15
3476 // i := j*8
3477 // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3478 // ENDFOR
3479 //
3480 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3482 {
3483 return vreinterpretq_m128i_s8(
3484 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3485 }
3486
// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
// b and saturates.
// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3491 {
3492 return vreinterpretq_m128i_u8(
3493 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3494 }
3495
3496 // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
3497 // unsigned 16-bit integers from b.
3498 //
3499 // r0 := (a0 * b0)[15:0]
3500 // r1 := (a1 * b1)[15:0]
3501 // ...
3502 // r7 := (a7 * b7)[15:0]
3503 //
3504 // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
3506 {
3507 return vreinterpretq_m128i_s16(
3508 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3509 }
3510
3511 // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
3512 // unsigned 32-bit integers from b.
3513 // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
3515 {
3516 return vreinterpretq_m128i_s32(
3517 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3518 }
3519
3520 // Multiply the packed unsigned 16-bit integers in a and b, producing
3521 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3522 // integers in dst.
3523 //
3524 // FOR j := 0 to 3
3525 // i := j*16
3526 // tmp[31:0] := a[i+15:i] * b[i+15:i]
3527 // dst[i+15:i] := tmp[31:16]
3528 // ENDFOR
3529 //
3530 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
3531 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
3532
3533 // Multiplies the four single-precision, floating-point values of a and b.
3534 //
3535 // r0 := a0 * b0
3536 // r1 := a1 * b1
3537 // r2 := a2 * b2
3538 // r3 := a3 * b3
3539 //
3540 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
3542 {
3543 return vreinterpretq_m128_f32(
3544 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3545 }
3546
3547 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
3548 // and store the results in dst.
3549 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
3551 {
3552 #if defined(__aarch64__)
3553 return vreinterpretq_m128d_f64(
3554 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3555 #else
3556 double *da = (double *) &a;
3557 double *db = (double *) &b;
3558 double c[2];
3559 c[0] = da[0] * db[0];
3560 c[1] = da[1] * db[1];
3561 return vld1q_f32((float32_t *) c);
3562 #endif
3563 }
3564
3565 // Multiply the lower double-precision (64-bit) floating-point element in a and
3566 // b, store the result in the lower element of dst, and copy the upper element
3567 // from a to the upper element of dst.
3568 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
3570 {
3571 return _mm_move_sd(a, _mm_mul_pd(a, b));
3572 }
3573
3574 // Multiply the lower single-precision (32-bit) floating-point element in a and
3575 // b, store the result in the lower element of dst, and copy the upper 3 packed
3576 // elements from a to the upper elements of dst.
3577 //
3578 // dst[31:0] := a[31:0] * b[31:0]
3579 // dst[127:32] := a[127:32]
3580 //
3581 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
3583 {
3584 return _mm_move_ss(a, _mm_mul_ps(a, b));
3585 }
3586
3587 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
3588 // a and b, and store the unsigned 64-bit results in dst.
3589 //
3590 // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
3591 // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
3593 {
3594 // vmull_u32 upcasts instead of masking, so we downcast.
3595 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
3596 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
3597 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
3598 }
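
// Illustrative sketch with hypothetical values (_mm_set_epi32 is assumed to be
// provided elsewhere in this file): only 32-bit lanes 0 and 2 of each input
// feed the two 64-bit products.
//
//   __m128i a = _mm_set_epi32(9, 3, 9, 2);  // lanes a3..a0; a3 and a1 ignored
//   __m128i b = _mm_set_epi32(9, 5, 9, 7);  // lanes b3..b0; b3 and b1 ignored
//   __m128i p = _mm_mul_epu32(a, b);        // p holds {2*7, 3*5} as two u64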
3599
3600 // Multiply the low unsigned 32-bit integers from a and b, and store the
3601 // unsigned 64-bit result in dst.
3602 //
3603 // dst[63:0] := a[31:0] * b[31:0]
3604 //
3605 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
3607 {
3608 return vreinterpret_m64_u64(vget_low_u64(
3609 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
3610 }
3611
3612 // Multiply the low signed 32-bit integers from each packed 64-bit element in
3613 // a and b, and store the signed 64-bit results in dst.
3614 //
3615 // r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
3616 // r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
3618 {
3619 // vmull_s32 upcasts instead of masking, so we downcast.
3620 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
3621 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
3622 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
3623 }
3624
3625 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3626 // integers from b.
3627 //
3628 // r0 := (a0 * b0) + (a1 * b1)
3629 // r1 := (a2 * b2) + (a3 * b3)
3630 // r2 := (a4 * b4) + (a5 * b5)
3631 // r3 := (a6 * b6) + (a7 * b7)
3632 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
3634 {
3635 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3636 vget_low_s16(vreinterpretq_s16_m128i(b)));
3637 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3638 vget_high_s16(vreinterpretq_s16_m128i(b)));
3639
3640 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
3641 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
3642
3643 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
3644 }
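
// Typical usage sketch (illustrative; dot16_accumulate is a hypothetical
// helper name): accumulating a 16-bit dot product into 32-bit sums.
//
//   static inline __m128i dot16_accumulate(__m128i acc, __m128i x, __m128i y)
//   {
//       return _mm_add_epi32(acc, _mm_madd_epi16(x, y));
//   }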
3645
3646 // Multiply packed signed 16-bit integers in a and b, producing intermediate
3647 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
3648 // the packed 16-bit integers in dst.
3649 //
3650 // r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
3651 // r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
3652 // r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
3653 // ...
3654 // r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
3656 {
3657 // Has issues due to saturation
3658 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
3659
3660 // Multiply
3661 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3662 vget_low_s16(vreinterpretq_s16_m128i(b)));
3663 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3664 vget_high_s16(vreinterpretq_s16_m128i(b)));
3665
3666 // Rounding narrowing shift right
3667 // narrow = (int16_t)((mul + 16384) >> 15);
3668 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
3669 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
3670
3671 // Join together
3672 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
3673 }
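
// Scalar model of one lane of the rounding shift above (illustrative;
// <stdint.h> is assumed):
//
//   static inline int16_t ref_mulhrs_s16(int16_t a, int16_t b)
//   {
//       int32_t p = (int32_t) a * (int32_t) b;
//       return (int16_t) ((p + 0x4000) >> 15);
//   }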
3674
3675 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
3676 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
3677 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
3678 // and pack the saturated results in dst.
3679 //
3680 // FOR j := 0 to 7
3681 // i := j*16
3682 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
3683 // a[i+7:i]*b[i+7:i] )
3684 // ENDFOR
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
3686 {
3687 #if defined(__aarch64__)
3688 uint8x16_t a = vreinterpretq_u8_m128i(_a);
3689 int8x16_t b = vreinterpretq_s8_m128i(_b);
3690 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
3691 vmovl_s8(vget_low_s8(b)));
3692 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
3693 vmovl_s8(vget_high_s8(b)));
3694 return vreinterpretq_m128i_s16(
3695 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
3696 #else
3697 // This would be much simpler if x86 would choose to zero extend OR sign
3698 // extend, not both. This could probably be optimized better.
3699 uint16x8_t a = vreinterpretq_u16_m128i(_a);
3700 int16x8_t b = vreinterpretq_s16_m128i(_b);
3701
3702 // Zero extend a
3703 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
3704 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
3705
3706 // Sign extend by shifting left then shifting right.
3707 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
3708 int16x8_t b_odd = vshrq_n_s16(b, 8);
3709
3710 // multiply
3711 int16x8_t prod1 = vmulq_s16(a_even, b_even);
3712 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
3713
3714 // saturated add
3715 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
3716 #endif
3717 }
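
// Scalar model of one 16-bit output lane (illustrative; <stdint.h> is assumed;
// a_lo/a_hi are the two unsigned bytes taken from a and b_lo/b_hi the two
// signed bytes taken from b for that lane):
//
//   static inline int16_t ref_maddubs(uint8_t a_lo, int8_t b_lo,
//                                     uint8_t a_hi, int8_t b_hi)
//   {
//       int32_t sum = (int32_t) a_lo * b_lo + (int32_t) a_hi * b_hi;
//       if (sum > 32767)
//           sum = 32767;
//       if (sum < -32768)
//           sum = -32768;
//       return (int16_t) sum;
//   }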
3718
// Computes the fused multiply-add product of 32-bit floating point numbers.
//
// Return Value
// Multiplies A and B, and adds C to the temporary result before returning it.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
3725 {
3726 #if defined(__aarch64__)
3727 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
3728 vreinterpretq_f32_m128(b),
3729 vreinterpretq_f32_m128(a)));
3730 #else
3731 return _mm_add_ps(_mm_mul_ps(a, b), c);
3732 #endif
3733 }
3734
3735 // Alternatively add and subtract packed single-precision (32-bit)
3736 // floating-point elements in a to/from packed elements in b, and store the
3737 // results in dst.
3738 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
3740 {
3741 __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
3742 return _mm_fmadd_ps(b, mask, a);
3743 }
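
// Lane-wise behaviour of the mask trick above (illustrative):
//
//   dst[0] := a[0] - b[0]
//   dst[1] := a[1] + b[1]
//   dst[2] := a[2] - b[2]
//   dst[3] := a[3] + b[3]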
3744
3745 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
3746 // elements in a and b, and pack the results in dst.
3747 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
3749 {
3750 #if defined(__aarch64__)
3751 return vreinterpretq_m128d_f64(
3752 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3753 #else
3754 double *da = (double *) &a;
3755 double *db = (double *) &b;
3756 double c[] = {da[0] + da[1], db[0] + db[1]};
3757 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
3758 #endif
3759 }
3760
3761 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3762 // b, then horizontally sum each consecutive 8 differences to produce two
3763 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3764 // 16 bits of 64-bit elements in dst.
3765 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
3767 {
3768 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
3769 uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3770 uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3771 uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3772 return (__m128i) vsetq_lane_u16(r4, r, 4);
3773 }
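
// Illustrative usage sketch (block_a and block_b are hypothetical variables;
// _mm_cvtsi128_si32 and _mm_extract_epi16 are assumed to be provided elsewhere
// in this file): summing the absolute differences over a full 16-byte block.
//
//   __m128i sad = _mm_sad_epu8(block_a, block_b);
//   int total = _mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4);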
3774
3775 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3776 // b, then horizontally sum each consecutive 8 differences to produce four
3777 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3778 // 16 bits of dst.
3779 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
3781 {
3782 uint16x4_t t =
3783 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3784 uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3785 return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
3786 }
3787
3788 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3789 // b, then horizontally sum each consecutive 8 differences to produce four
3790 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3791 // 16 bits of dst.
3792 //
3793 // FOR j := 0 to 7
3794 // i := j*8
3795 // tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
3796 // ENDFOR
// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
//              tmp[47:40] + tmp[55:48] + tmp[63:56]
// dst[63:16] := 0
3799 //
3800 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
3801 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
3802
3803 // Divides the four single-precision, floating-point values of a and b.
3804 //
3805 // r0 := a0 / b0
3806 // r1 := a1 / b1
3807 // r2 := a2 / b2
3808 // r3 := a3 / b3
3809 //
3810 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
3812 {
3813 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
3814 return vreinterpretq_m128_f32(
3815 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3816 #else
3817 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
3818 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
3819 #if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
3821 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
3822 #endif
3823 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
3824 #endif
3825 }
3826
3827 // Divides the scalar single-precision floating point value of a by b.
3828 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
3830 {
3831 float32_t value =
3832 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
3833 return vreinterpretq_m128_f32(
3834 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3835 }
3836
3837 // Divide packed double-precision (64-bit) floating-point elements in a by
3838 // packed elements in b, and store the results in dst.
3839 //
3840 // FOR j := 0 to 1
3841 // i := 64*j
3842 // dst[i+63:i] := a[i+63:i] / b[i+63:i]
3843 // ENDFOR
3844 //
3845 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
3847 {
3848 #if defined(__aarch64__)
3849 return vreinterpretq_m128d_f64(
3850 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3851 #else
3852 double *da = (double *) &a;
3853 double *db = (double *) &b;
3854 double c[2];
3855 c[0] = da[0] / db[0];
3856 c[1] = da[1] / db[1];
3857 return vld1q_f32((float32_t *) c);
3858 #endif
3859 }
3860
3861 // Divide the lower double-precision (64-bit) floating-point element in a by the
3862 // lower double-precision (64-bit) floating-point element in b, store the result
3863 // in the lower element of dst, and copy the upper element from a to the upper
3864 // element of dst.
3865 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
3867 {
3868 #if defined(__aarch64__)
3869 float64x2_t tmp =
3870 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
3871 return vreinterpretq_m128d_f64(
3872 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
3873 #else
3874 return _mm_move_sd(a, _mm_div_pd(a, b));
3875 #endif
3876 }
3877
3878 // Compute the approximate reciprocal of packed single-precision (32-bit)
3879 // floating-point elements in a, and store the results in dst. The maximum
3880 // relative error for this approximation is less than 1.5*2^-12.
3881 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
3883 {
3884 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
3885 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3886 #if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
3888 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3889 #endif
3890 return vreinterpretq_m128_f32(recip);
3891 }
3892
3893 // Compute the approximate reciprocal of the lower single-precision (32-bit)
3894 // floating-point element in a, store the result in the lower element of dst,
3895 // and copy the upper 3 packed elements from a to the upper elements of dst. The
3896 // maximum relative error for this approximation is less than 1.5*2^-12.
3897 //
3898 // dst[31:0] := (1.0 / a[31:0])
3899 // dst[127:32] := a[127:32]
3900 //
3901 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
3903 {
3904 return _mm_move_ss(a, _mm_rcp_ps(a));
3905 }
3906
3907 // Computes the approximations of square roots of the four single-precision,
3908 // floating-point values of a. First computes reciprocal square roots and then
3909 // reciprocals of the four values.
3910 //
3911 // r0 := sqrt(a0)
3912 // r1 := sqrt(a1)
3913 // r2 := sqrt(a2)
3914 // r3 := sqrt(a3)
3915 //
3916 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
3918 {
3919 #if SSE2NEON_PRECISE_SQRT
3920 float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3921
3922 // Test for vrsqrteq_f32(0) -> positive infinity case.
3923 // Change to zero, so that s * 1/sqrt(s) result is zero too.
3924 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
3925 const uint32x4_t div_by_zero =
3926 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
3927 recip = vreinterpretq_f32_u32(
3928 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
3929
    // Additional Newton-Raphson iteration for accuracy
3931 recip = vmulq_f32(
3932 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
3933 recip);
3934 recip = vmulq_f32(
3935 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
3936 recip);
3937
3938 // sqrt(s) = s * 1/sqrt(s)
3939 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
3940 #elif defined(__aarch64__)
3941 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
3942 #else
3943 float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3944 float32x4_t sq = vrecpeq_f32(recipsq);
3945 return vreinterpretq_m128_f32(sq);
3946 #endif
3947 }
3948
3949 // Computes the approximation of the square root of the scalar single-precision
3950 // floating point value of in.
3951 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
3953 {
3954 float32_t value =
3955 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
3956 return vreinterpretq_m128_f32(
3957 vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
3958 }
3959
3960 // Computes the approximations of the reciprocal square roots of the four
3961 // single-precision floating point values of in.
3962 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
3964 {
3965 float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3966 #if SSE2NEON_PRECISE_RSQRT
    // Additional Newton-Raphson iteration for accuracy
3968 out = vmulq_f32(
3969 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
3970 out = vmulq_f32(
3971 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
3972 #endif
3973 return vreinterpretq_m128_f32(out);
3974 }
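
// The refinement above is one Newton-Raphson step written with vrsqrtsq_f32:
// given an estimate y for 1/sqrt(x), the update is (illustrative)
//
//   y' = y * (3 - x * y * y) / 2
//
// where vrsqrtsq_f32(x * y, y) supplies the (3 - x*y*y) / 2 factor.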
3975
3976 // Compute the approximate reciprocal square root of the lower single-precision
3977 // (32-bit) floating-point element in a, store the result in the lower element
3978 // of dst, and copy the upper 3 packed elements from a to the upper elements of
3979 // dst.
3980 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
3982 {
3983 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
3984 }
3985
3986 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3987 // values in dst.
3988 //
3989 // FOR j := 0 to 3
3990 // i := j*16
3991 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3992 // ENDFOR
3993 //
3994 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
3996 {
3997 return vreinterpret_m64_s16(
3998 vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3999 }
4000
4012 // Computes the maximums of the four single-precision, floating-point values of
4013 // a and b.
4014 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
4016 {
4017 #if SSE2NEON_PRECISE_MINMAX
4018 float32x4_t _a = vreinterpretq_f32_m128(a);
4019 float32x4_t _b = vreinterpretq_f32_m128(b);
4020 return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
4021 #else
4022 return vreinterpretq_m128_f32(
4023 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4024 #endif
4025 }
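
// Illustrative difference the precise path preserves (x86 semantics; NAN from
// <math.h> and _mm_set1_ps from elsewhere in this file are assumed): when an
// operand is NaN, x86 MAXPS returns the second operand, whereas plain
// vmaxq_f32 would typically propagate the NaN.
//
//   __m128 n = _mm_set1_ps(NAN);
//   __m128 x = _mm_set1_ps(1.0f);
//   __m128 r = _mm_max_ps(n, x);  // with SSE2NEON_PRECISE_MINMAX: r equals x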
4026
4027 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
4028 // values in dst.
4029 //
4030 // FOR j := 0 to 7
4031 // i := j*8
4032 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
4033 // ENDFOR
4034 //
4035 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
4037 {
4038 return vreinterpret_m64_u8(
4039 vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
4040 }
4041
4053 // Compare packed signed 16-bit integers in a and b, and store packed minimum
4054 // values in dst.
4055 //
4056 // FOR j := 0 to 3
4057 // i := j*16
4058 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
4059 // ENDFOR
4060 //
4061 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
4063 {
4064 return vreinterpret_m64_s16(
4065 vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
4066 }
4067
4079 // Computes the minima of the four single-precision, floating-point values of a
4080 // and b.
4081 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
4083 {
4084 #if SSE2NEON_PRECISE_MINMAX
4085 float32x4_t _a = vreinterpretq_f32_m128(a);
4086 float32x4_t _b = vreinterpretq_f32_m128(b);
4087 return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
4088 #else
4089 return vreinterpretq_m128_f32(
4090 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4091 #endif
4092 }
4093
4094 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
4095 // values in dst.
4096 //
4097 // FOR j := 0 to 7
4098 // i := j*8
4099 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
4100 // ENDFOR
4101 //
4102 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
4104 {
4105 return vreinterpret_m64_u8(
4106 vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
4107 }
4108
4120 // Computes the maximum of the two lower scalar single-precision floating point
4121 // values of a and b.
4122 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
4124 {
4125 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
4126 return vreinterpretq_m128_f32(
4127 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
4128 }
4129
4130 // Computes the minimum of the two lower scalar single-precision floating point
4131 // values of a and b.
4132 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
4134 {
4135 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
4136 return vreinterpretq_m128_f32(
4137 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
4138 }
4139
4140 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4141 // 16 unsigned 8-bit integers from b.
4142 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4144 {
4145 return vreinterpretq_m128i_u8(
4146 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4147 }
4148
4149 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4150 // 16 unsigned 8-bit integers from b.
// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4153 {
4154 return vreinterpretq_m128i_u8(
4155 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4156 }
4157
4158 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4159 // signed 16-bit integers from b.
4160 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4162 {
4163 return vreinterpretq_m128i_s16(
4164 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4165 }
4166
4167 // Compare packed signed 8-bit integers in a and b, and store packed maximum
4168 // values in dst.
4169 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
4171 {
4172 return vreinterpretq_m128i_s8(
4173 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4174 }
4175
4176 // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
4177 // values in dst.
4178 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
4180 {
4181 return vreinterpretq_m128i_u16(
4182 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
4183 }
4184
4185 // Compare packed signed 8-bit integers in a and b, and store packed minimum
4186 // values in dst.
4187 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
4189 {
4190 return vreinterpretq_m128i_s8(
4191 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4192 }
4193
4194 // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
4195 // values in dst.
4196 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
4198 {
4199 return vreinterpretq_m128i_u16(
4200 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
4201 }
4202
4203 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4204 // signed 16-bit integers from b.
4205 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4207 {
4208 return vreinterpretq_m128i_s16(
4209 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4210 }
4211
// epi versions of min/max
// Computes the pairwise maximums of the four signed 32-bit integer values of a
// and b.
4215 //
4216 // A 128-bit parameter that can be defined with the following equations:
4217 // r0 := (a0 > b0) ? a0 : b0
4218 // r1 := (a1 > b1) ? a1 : b1
4219 // r2 := (a2 > b2) ? a2 : b2
4220 // r3 := (a3 > b3) ? a3 : b3
4221 //
4222 // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
4224 {
4225 return vreinterpretq_m128i_s32(
4226 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4227 }
4228
// Computes the pairwise minima of the four signed 32-bit integer values of a
// and b.
4231 //
4232 // A 128-bit parameter that can be defined with the following equations:
4233 // r0 := (a0 < b0) ? a0 : b0
4234 // r1 := (a1 < b1) ? a1 : b1
4235 // r2 := (a2 < b2) ? a2 : b2
4236 // r3 := (a3 < b3) ? a3 : b3
4237 //
4238 // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
4240 {
4241 return vreinterpretq_m128i_s32(
4242 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4243 }
4244
4245 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
4246 // values in dst.
4247 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
4249 {
4250 return vreinterpretq_m128i_u32(
4251 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
4252 }
4253
4254 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
4255 // values in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
4258 {
4259 return vreinterpretq_m128i_u32(
4260 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
4261 }
4262
4263 // Multiply the packed unsigned 16-bit integers in a and b, producing
4264 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
4265 // integers in dst.
4266 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
4268 {
4269 return vreinterpret_m64_u16(vshrn_n_u32(
4270 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
4271 }
4272
4273 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4274 // integers from b.
4275 //
4276 // r0 := (a0 * b0)[31:16]
4277 // r1 := (a1 * b1)[31:16]
4278 // ...
4279 // r7 := (a7 * b7)[31:16]
4280 //
4281 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4283 {
4284 /* FIXME: issue with large values because of result saturation */
4285 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4286 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4287 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4288 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4289 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4290 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4291 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4292 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4293 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4294 uint16x8x2_t r =
4295 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4296 return vreinterpretq_m128i_u16(r.val[1]);
4297 }
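
// Worked example of the high-half semantics above (illustrative sketch, not
// part of the original header): with a0 = 20000 and b0 = 30000 the full 32-bit
// product is 600000000 = 0x23C34600, so the result lane holds 0x23C3 = 9155.
//
//   __m128i a = _mm_set1_epi16(20000);
//   __m128i b = _mm_set1_epi16(30000);
//   __m128i r = _mm_mulhi_epi16(a, b); // every 16-bit lane holds 9155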
4298
4299 // Multiply the packed unsigned 16-bit integers in a and b, producing
4300 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
4301 // integers in dst.
4302 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
4303 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4304 {
4305 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4306 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4307 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4308 #if defined(__aarch64__)
4309 uint32x4_t ab7654 =
4310 vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4311 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4312 vreinterpretq_u16_u32(ab7654));
4313 return vreinterpretq_m128i_u16(r);
4314 #else
4315 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4316 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4317 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4318 uint16x8x2_t r =
4319 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4320 return vreinterpretq_m128i_u16(r.val[1]);
4321 #endif
4322 }
4323
4324 // Computes pairwise add of the single-precision, floating-point values of a
4325 // and b.
4326 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
4327 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
4328 {
4329 #if defined(__aarch64__)
4330 return vreinterpretq_m128_f32(
4331 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4332 #else
4333 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
4334 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
4335 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
4336 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
4337 return vreinterpretq_m128_f32(
4338 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
4339 #endif
4340 }
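
// Illustrative sketch of the pairwise-add semantics above (not from the
// original header): with a = {1, 2, 3, 4} and b = {5, 6, 7, 8} the result is
// {a0+a1, a2+a3, b0+b1, b2+b3} = {3, 7, 11, 15}.
//
//   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // a0..a3 = 1, 2, 3, 4
//   __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f); // b0..b3 = 5, 6, 7, 8
//   __m128 r = _mm_hadd_ps(a, b);                  // {3, 7, 11, 15}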
4341
4342 // Computes pairwise add of the 16-bit signed or unsigned integer values of a
4343 // and b.
4344 FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
4345 {
4346 int16x8_t a = vreinterpretq_s16_m128i(_a);
4347 int16x8_t b = vreinterpretq_s16_m128i(_b);
4348 #if defined(__aarch64__)
4349 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
4350 #else
4351 return vreinterpretq_m128i_s16(
4352 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
4353 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
4354 #endif
4355 }
4356
4357 // Horizontally subtract adjacent pairs of single-precision (32-bit)
4358 // floating-point elements in a and b, and pack the results in dst.
4359 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
4360 FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
4361 {
4362 #if defined(__aarch64__)
4363 return vreinterpretq_m128_f32(vsubq_f32(
4364 vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
4365 vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
4366 #else
4367 float32x4x2_t c =
4368 vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
4369 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
4370 #endif
4371 }
4372
4373 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
4374 // signed 16-bit results in dst.
4375 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
4376 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
4377 {
4378 return vreinterpret_m64_s16(
4379 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
4380 }
4381
4382 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
4383 // signed 32-bit results in dst.
4384 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
4385 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
4386 {
4387 return vreinterpret_m64_s32(
4388 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
4389 }
4390
4391 // Computes pairwise difference of the 16-bit signed or unsigned integer values
4392 // of a and b.
4393 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
4394 {
4395 int32x4_t a = vreinterpretq_s32_m128i(_a);
4396 int32x4_t b = vreinterpretq_s32_m128i(_b);
4397 // Interleave using vshrn/vmovn
4398 // [a0|a2|a4|a6|b0|b2|b4|b6]
4399 // [a1|a3|a5|a7|b1|b3|b5|b7]
4400 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
4401 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
4402 // Subtract
4403 return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
4404 }
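
// The trick above works because, viewed as 32-bit lanes, vmovn_s32 keeps the
// low 16 bits of each lane (elements 0, 2, 4, 6) while vshrn_n_s32(., 16)
// keeps the high 16 bits (elements 1, 3, 5, 7), so the subtraction yields the
// horizontal differences. Illustrative sketch (not from the original header):
//
//   __m128i a = _mm_set_epi16(4, 40, 3, 30, 2, 20, 1, 10); // a0..a7 = 10,1,20,2,30,3,40,4
//   __m128i r = _mm_hsub_epi16(a, a);
//   // low four lanes of r: a0-a1, a2-a3, a4-a5, a6-a7 = 9, 18, 27, 36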
4405
4406 // Computes saturated pairwise add of the 16-bit signed integer values of a
4407 // and b.
4408 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
4409 {
4410 #if defined(__aarch64__)
4411 int16x8_t a = vreinterpretq_s16_m128i(_a);
4412 int16x8_t b = vreinterpretq_s16_m128i(_b);
4413 return vreinterpretq_m128i_s16(
4414 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
4415 #else
4416 int32x4_t a = vreinterpretq_s32_m128i(_a);
4417 int32x4_t b = vreinterpretq_s32_m128i(_b);
4418 // Interleave using vshrn/vmovn
4419 // [a0|a2|a4|a6|b0|b2|b4|b6]
4420 // [a1|a3|a5|a7|b1|b3|b5|b7]
4421 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
4422 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
4423 // Saturated add
4424 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
4425 #endif
4426 }
4427
4428 // Computes saturated pairwise difference of the 16-bit signed integer values
4429 // of a and b.
4430 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
4431 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
4432 {
4433 #if defined(__aarch64__)
4434 int16x8_t a = vreinterpretq_s16_m128i(_a);
4435 int16x8_t b = vreinterpretq_s16_m128i(_b);
4436 return vreinterpretq_m128i_s16(
4437 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
4438 #else
4439 int32x4_t a = vreinterpretq_s32_m128i(_a);
4440 int32x4_t b = vreinterpretq_s32_m128i(_b);
4441 // Interleave using vshrn/vmovn
4442 // [a0|a2|a4|a6|b0|b2|b4|b6]
4443 // [a1|a3|a5|a7|b1|b3|b5|b7]
4444 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
4445 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
4446 // Saturated subtract
4447 return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
4448 #endif
4449 }
4450
4451 // Computes pairwise add of the 32-bit signed or unsigned integer values of a
4452 // and b.
4453 FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
4454 {
4455 int32x4_t a = vreinterpretq_s32_m128i(_a);
4456 int32x4_t b = vreinterpretq_s32_m128i(_b);
4457 return vreinterpretq_m128i_s32(
4458 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
4459 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
4460 }
4461
4462 // Computes pairwise difference of the 32-bit signed or unsigned integer values
4463 // of a and b.
4464 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
4465 {
4466 int64x2_t a = vreinterpretq_s64_m128i(_a);
4467 int64x2_t b = vreinterpretq_s64_m128i(_b);
4468 // Interleave using vshrn/vmovn
4469 // [a0|a2|b0|b2]
4470 // [a1|a3|b1|b3]
4471 int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
4472 int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
4473 // Subtract
4474 return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
4475 }
4476
4477 // Kahan summation for accurate summation of floating-point numbers.
4478 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
4479 FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
4480 {
4481 y -= *c;
4482 float t = *sum + y;
4483 *c = (t - *sum) - y;
4484 *sum = t;
4485 }
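
// Minimal usage sketch of the helper above (illustrative only; n and data are
// placeholders): carry the compensation term across every accumulation, then
// fold it back in once at the end.
//
//   float sum = 0.0f, comp = 0.0f;
//   for (int i = 0; i < n; i++)
//       _sse2neon_kadd_f32(&sum, &comp, data[i]);
//   sum += comp;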
4486
4487 // Conditionally multiply the packed single-precision (32-bit) floating-point
4488 // elements in a and b using the high 4 bits in imm8, sum the four products,
4489 // and conditionally store the sum in dst using the low 4 bits of imm.
4490 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
4491 FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
4492 {
4493 #if defined(__aarch64__)
4494 /* shortcuts */
4495 if (imm == 0xFF) {
4496 return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
4497 }
4498 if (imm == 0x7F) {
4499 float32x4_t m = _mm_mul_ps(a, b);
4500 m[3] = 0;
4501 return _mm_set1_ps(vaddvq_f32(m));
4502 }
4503 #endif
4504
4505 float s = 0, c = 0;
4506 float32x4_t f32a = vreinterpretq_f32_m128(a);
4507 float32x4_t f32b = vreinterpretq_f32_m128(b);
4508
4509 /* To improve the accuracy of floating-point summation, Kahan algorithm
4510 * is used for each operation.
4511 */
4512 if (imm & (1 << 4))
4513 _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
4514 if (imm & (1 << 5))
4515 _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
4516 if (imm & (1 << 6))
4517 _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
4518 if (imm & (1 << 7))
4519 _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
4520 s += c;
4521
4522 float32x4_t res = {
4523 (imm & 0x1) ? s : 0,
4524 (imm & 0x2) ? s : 0,
4525 (imm & 0x4) ? s : 0,
4526 (imm & 0x8) ? s : 0,
4527 };
4528 return vreinterpretq_m128_f32(res);
4529 }
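
// Usage sketch (illustrative, not from the original header): imm bits 4-7
// select which products enter the sum and bits 0-3 select which result lanes
// receive it, so 0x71 is a common 3-component dot product written to lane 0.
//
//   __m128 a = _mm_set_ps(9.0f, 3.0f, 2.0f, 1.0f); // lane 3 ignored by 0x71
//   __m128 b = _mm_set_ps(9.0f, 6.0f, 5.0f, 4.0f);
//   __m128 d = _mm_dp_ps(a, b, 0x71); // lane 0 = 1*4 + 2*5 + 3*6 = 32, lanes 1..3 = 0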
4530
4531 /* Compare operations */
4532
4533 // Compares for less than
4534 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
4535 FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
4536 {
4537 return vreinterpretq_m128_u32(
4538 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4539 }
4540
4541 // Compares for less than
4542 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
4543 FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
4544 {
4545 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
4546 }
4547
4548 // Compares for greater than.
4549 //
4550 // r0 := (a0 > b0) ? 0xffffffff : 0x0
4551 // r1 := (a1 > b1) ? 0xffffffff : 0x0
4552 // r2 := (a2 > b2) ? 0xffffffff : 0x0
4553 // r3 := (a3 > b3) ? 0xffffffff : 0x0
4554 //
4555 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
4556 FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
4557 {
4558 return vreinterpretq_m128_u32(
4559 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4560 }
4561
4562 // Compares for greater than.
4563 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
4564 FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
4565 {
4566 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
4567 }
4568
4569 // Compares for greater than or equal.
4570 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
4571 FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
4572 {
4573 return vreinterpretq_m128_u32(
4574 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4575 }
4576
4577 // Compares for greater than or equal.
4578 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
4579 FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
4580 {
4581 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
4582 }
4583
4584 // Compares for less than or equal.
4585 //
4586 // r0 := (a0 <= b0) ? 0xffffffff : 0x0
4587 // r1 := (a1 <= b1) ? 0xffffffff : 0x0
4588 // r2 := (a2 <= b2) ? 0xffffffff : 0x0
4589 // r3 := (a3 <= b3) ? 0xffffffff : 0x0
4590 //
4591 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
4592 FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
4593 {
4594 return vreinterpretq_m128_u32(
4595 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4596 }
4597
4598 // Compares for less than or equal.
4599 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
4600 FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
4601 {
4602 return _mm_move_ss(a, _mm_cmple_ps(a, b));
4603 }
4604
4605 // Compares for equality.
4606 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
4607 FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
4608 {
4609 return vreinterpretq_m128_u32(
4610 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4611 }
4612
4613 // Compares for equality.
4614 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
4615 FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
4616 {
4617 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
4618 }
4619
4620 // Compares for inequality.
4621 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
4622 FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
4623 {
4624 return vreinterpretq_m128_u32(vmvnq_u32(
4625 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
4626 }
4627
4628 // Compares for inequality.
4629 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
4630 FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
4631 {
4632 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
4633 }
4634
4635 // Compares for not greater than or equal.
4636 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
4637 FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
4638 {
4639 return _mm_cmplt_ps(a, b);
4640 }
4641
4642 // Compares for not greater than or equal.
4643 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
4644 FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
4645 {
4646 return _mm_cmplt_ss(a, b);
4647 }
4648
4649 // Compares for not greater than.
4650 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
4651 FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
4652 {
4653 return _mm_cmple_ps(a, b);
4654 }
4655
4656 // Compares for not greater than.
4657 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
4658 FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
4659 {
4660 return _mm_cmple_ss(a, b);
4661 }
4662
4663 // Compares for not less than or equal.
4664 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
4665 FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
4666 {
4667 return _mm_cmpgt_ps(a, b);
4668 }
4669
4670 // Compares for not less than or equal.
4671 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
4672 FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
4673 {
4674 return _mm_cmpgt_ss(a, b);
4675 }
4676
4677 // Compares for not less than.
4678 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
4679 FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
4680 {
4681 return _mm_cmpge_ps(a, b);
4682 }
4683
4684 // Compares for not less than.
4685 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
4686 FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
4687 {
4688 return _mm_cmpge_ss(a, b);
4689 }
4690
4691 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
4692 // unsigned 8-bit integers in b for equality.
4693 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
4694 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
4695 {
4696 return vreinterpretq_m128i_u8(
4697 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4698 }
4699
4700 // Compare packed double-precision (64-bit) floating-point elements in a and b
4701 // for equality, and store the results in dst.
4702 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
4703 FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
4704 {
4705 #if defined(__aarch64__)
4706 return vreinterpretq_m128d_u64(
4707 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4708 #else
4709 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4710 uint32x4_t cmp =
4711 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
4712 uint32x4_t swapped = vrev64q_u32(cmp);
4713 return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
4714 #endif
4715 }
4716
4717 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
4718 // unsigned 16-bit integers in b for equality.
4719 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
4720 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
4721 {
4722 return vreinterpretq_m128i_u16(
4723 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4724 }
4725
4726 // Compare packed 32-bit integers in a and b for equality, and store the results
4727 // in dst
4728 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
4729 {
4730 return vreinterpretq_m128i_u32(
4731 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4732 }
4733
4734 // Compare packed 64-bit integers in a and b for equality, and store the results
4735 // in dst
4736 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
4737 {
4738 #if defined(__aarch64__)
4739 return vreinterpretq_m128i_u64(
4740 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
4741 #else
4742 // ARMv7 lacks vceqq_u64
4743 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4744 uint32x4_t cmp =
4745 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
4746 uint32x4_t swapped = vrev64q_u32(cmp);
4747 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
4748 #endif
4749 }
4750
4751 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4752 // in b for less than.
4753 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
4754 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
4755 {
4756 return vreinterpretq_m128i_u8(
4757 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4758 }
4759
4760 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4761 // in b for greater than.
4762 //
4763 // r0 := (a0 > b0) ? 0xff : 0x0
4764 // r1 := (a1 > b1) ? 0xff : 0x0
4765 // ...
4766 // r15 := (a15 > b15) ? 0xff : 0x0
4767 //
4768 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
4769 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
4770 {
4771 return vreinterpretq_m128i_u8(
4772 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4773 }
4774
4775 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4776 // in b for less than.
4777 //
4778 // r0 := (a0 < b0) ? 0xffff : 0x0
4779 // r1 := (a1 < b1) ? 0xffff : 0x0
4780 // ...
4781 // r7 := (a7 < b7) ? 0xffff : 0x0
4782 //
4783 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
4784 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
4785 {
4786 return vreinterpretq_m128i_u16(
4787 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4788 }
4789
4790 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4791 // in b for greater than.
4792 //
4793 // r0 := (a0 > b0) ? 0xffff : 0x0
4794 // r1 := (a1 > b1) ? 0xffff : 0x0
4795 // ...
4796 // r7 := (a7 > b7) ? 0xffff : 0x0
4797 //
4798 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
4799 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
4800 {
4801 return vreinterpretq_m128i_u16(
4802 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4803 }
4804
4805
4806 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4807 // in b for less than.
4808 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
4809 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
4810 {
4811 return vreinterpretq_m128i_u32(
4812 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4813 }
4814
4815 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4816 // in b for greater than.
4817 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
4818 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
4819 {
4820 return vreinterpretq_m128i_u32(
4821 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4822 }
4823
4824 // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
4825 // in b for greater than.
4826 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
4827 {
4828 #if defined(__aarch64__)
4829 return vreinterpretq_m128i_u64(
4830 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
4831 #else
4832 // ARMv7 lacks vcgtq_s64.
4833 // This is based off of Clang's SSE2 polyfill:
4834 // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
4835
4836 // Mask the sign bit out since we need a signed AND an unsigned comparison
4837 // and it is ugly to try and split them.
4838 int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
4839 int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
4840 int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
4841 // Check if a > b
4842 int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
4843 // Copy upper mask to lower mask
4844 // a_hi > b_hi
4845 int64x2_t gt_hi = vshrq_n_s64(greater, 63);
4846 // Copy lower mask to upper mask
4847 // a_lo > b_lo
4848 int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
4849 // Compare for equality
4850 int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
4851 // Copy upper mask to lower mask
4852 // a_hi == b_hi
4853 int64x2_t eq_hi = vshrq_n_s64(equal, 63);
4854 // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
4855 int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
4856 return vreinterpretq_m128i_s64(ret);
4857 #endif
4858 }
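
// Scalar restatement of the identity used above (illustrative): for signed
// 64-bit values split into 32-bit halves,
//
//   (a > b)  <=>  (a_hi > b_hi) || (a_hi == b_hi && a_lo >u b_lo)
//
// where the high halves compare as signed and the low halves as unsigned;
// XOR-ing the low halves with 0x80000000 lets a single signed vcgtq_s32 cover
// both comparisons.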
4859
4860 // Compares the four 32-bit floats in a and b to check if any values are NaN.
4861 // Ordered compare between each value returns true for "orderable" and false for
4862 // "not orderable" (NaN).
4863 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
4864 // also:
4865 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
4866 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
4867 FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
4868 {
4869 // Note: NEON does not have ordered compare builtin
4870 // Need to compare a eq a and b eq b to check for NaN
4871 // Do AND of results to get final
4872 uint32x4_t ceqaa =
4873 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4874 uint32x4_t ceqbb =
4875 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4876 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
4877 }
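
// Typical usage sketch (illustrative; v is a placeholder): comparing a vector
// with itself yields an all-ones mask in the lanes that are not NaN, which can
// then be used to zero out NaN lanes.
//
//   __m128 not_nan = _mm_cmpord_ps(v, v);
//   __m128 cleaned = _mm_and_ps(v, not_nan); // NaN lanes become 0.0f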
4878
4879 // Compares for ordered.
4880 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
4881 FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
4882 {
4883 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
4884 }
4885
4886 // Compares for unordered.
4887 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
4888 FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
4889 {
4890 uint32x4_t f32a =
4891 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4892 uint32x4_t f32b =
4893 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4894 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
4895 }
4896
4897 // Compares for unordered.
4898 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
4899 FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
4900 {
4901 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
4902 }
4903
4904 // Compares the lower single-precision floating point scalar values of a and b
4905 // using a less than operation. :
4906 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
4907 // note!! The documentation on MSDN is incorrect! If either of the values is a
4908 // NAN the docs say you will get a one, but in fact, it will return a zero!!
4909 FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
4910 {
4911 uint32x4_t a_not_nan =
4912 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4913 uint32x4_t b_not_nan =
4914 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4915 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4916 uint32x4_t a_lt_b =
4917 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4918 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
4919 }
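
// Illustration of the NaN note above (sketch only; NAN is assumed to come from
// <math.h>): an unordered operand makes the comparison return 0.
//
//   int r1 = _mm_comilt_ss(_mm_set_ss(1.0f), _mm_set_ss(NAN));  // 0 (unordered)
//   int r2 = _mm_comilt_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f)); // 1 (1.0f < 2.0f)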
4920
4921 // Compares the lower single-precision floating point scalar values of a and b
4922 // using a greater than operation. :
4923 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
4924 FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
4925 {
4926 // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
4927 // vreinterpretq_f32_m128(b)), 0);
4928 uint32x4_t a_not_nan =
4929 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4930 uint32x4_t b_not_nan =
4931 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4932 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4933 uint32x4_t a_gt_b =
4934 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4935 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
4936 }
4937
4938 // Compares the lower single-precision floating point scalar values of a and b
4939 // using a less than or equal operation. :
4940 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
4941 FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
4942 {
4943 // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
4944 // vreinterpretq_f32_m128(b)), 0);
4945 uint32x4_t a_not_nan =
4946 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4947 uint32x4_t b_not_nan =
4948 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4949 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4950 uint32x4_t a_le_b =
4951 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4952 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
4953 }
4954
4955 // Compares the lower single-precision floating point scalar values of a and b
4956 // using a greater than or equal operation. :
4957 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
4958 FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
4959 {
4960 // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
4961 // vreinterpretq_f32_m128(b)), 0);
4962 uint32x4_t a_not_nan =
4963 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4964 uint32x4_t b_not_nan =
4965 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4966 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4967 uint32x4_t a_ge_b =
4968 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4969 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
4970 }
4971
4972 // Compares the lower single-precision floating point scalar values of a and b
4973 // using an equality operation. :
4974 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
4975 FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
4976 {
4977 // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4978 // vreinterpretq_f32_m128(b)), 0);
4979 uint32x4_t a_not_nan =
4980 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4981 uint32x4_t b_not_nan =
4982 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4983 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4984 uint32x4_t a_eq_b =
4985 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4986 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
4987 }
4988
4989 // Compares the lower single-precision floating point scalar values of a and b
4990 // using an inequality operation. :
4991 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
4992 FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
4993 {
4994 // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4995 // vreinterpretq_f32_m128(b)), 0);
4996 uint32x4_t a_not_nan =
4997 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4998 uint32x4_t b_not_nan =
4999 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
5000 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
5001 uint32x4_t a_neq_b = vmvnq_u32(
5002 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5003 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
5004 }
5005
5006 // according to the documentation, these intrinsics behave the same as the
5007 // non-'u' versions. We'll just alias them here.
5008 #define _mm_ucomieq_ss _mm_comieq_ss
5009 #define _mm_ucomige_ss _mm_comige_ss
5010 #define _mm_ucomigt_ss _mm_comigt_ss
5011 #define _mm_ucomile_ss _mm_comile_ss
5012 #define _mm_ucomilt_ss _mm_comilt_ss
5013 #define _mm_ucomineq_ss _mm_comineq_ss
5014
5015 /* Conversions */
5016
5017 // Convert packed signed 32-bit integers in b to packed single-precision
5018 // (32-bit) floating-point elements, store the results in the lower 2 elements
5019 // of dst, and copy the upper 2 packed elements from a to the upper elements of
5020 // dst.
5021 //
5022 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
5023 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
5024 // dst[95:64] := a[95:64]
5025 // dst[127:96] := a[127:96]
5026 //
5027 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
5028 FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
5029 {
5030 return vreinterpretq_m128_f32(
5031 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
5032 vget_high_f32(vreinterpretq_f32_m128(a))));
5033 }
5034
5035 // Convert the signed 32-bit integer b to a single-precision (32-bit)
5036 // floating-point element, store the result in the lower element of dst, and
5037 // copy the upper 3 packed elements from a to the upper elements of dst.
5038 //
5039 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
5040 // dst[127:32] := a[127:32]
5041 //
5042 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
5043 FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
5044 {
5045 return vreinterpretq_m128_f32(
5046 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
5047 }
5048
5049 // Convert the signed 32-bit integer b to a single-precision (32-bit)
5050 // floating-point element, store the result in the lower element of dst, and
5051 // copy the upper 3 packed elements from a to the upper elements of dst.
5052 //
5053 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
5054 // dst[127:32] := a[127:32]
5055 //
5056 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
5057 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
5058
5059 // Convert the signed 64-bit integer b to a single-precision (32-bit)
5060 // floating-point element, store the result in the lower element of dst, and
5061 // copy the upper 3 packed elements from a to the upper elements of dst.
5062 //
5063 // dst[31:0] := Convert_Int64_To_FP32(b[63:0])
5064 // dst[127:32] := a[127:32]
5065 //
5066 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
5067 FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
5068 {
5069 return vreinterpretq_m128_f32(
5070 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
5071 }
5072
5073 // Convert the lower single-precision (32-bit) floating-point element in a to a
5074 // 32-bit integer, and store the result in dst.
5075 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
5076 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
5077 {
5078 #if defined(__aarch64__)
5079 return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
5080 #else
5081 float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
5082 float32_t diff = data - floor(data);
5083 if (diff > 0.5)
5084 return (int32_t) ceil(data);
5085 if (unlikely(diff == 0.5)) {
5086 int32_t f = (int32_t) floor(data);
5087 int32_t c = (int32_t) ceil(data);
5088 return c & 1 ? f : c;
5089 }
5090 return (int32_t) floor(data);
5091 #endif
5092 }
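
// Rounding behaviour sketch (illustrative): both paths above implement
// round-to-nearest with ties going to the even integer.
//
//   _mm_cvt_ss2si(_mm_set_ss(0.5f));  // 0
//   _mm_cvt_ss2si(_mm_set_ss(1.5f));  // 2
//   _mm_cvt_ss2si(_mm_set_ss(2.5f));  // 2
//   _mm_cvt_ss2si(_mm_set_ss(-0.5f)); // 0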
5093
5094 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
5095 // floating-point elements, and store the results in dst.
5096 //
5097 // FOR j := 0 to 3
5098 // i := j*16
5099 // m := j*32
5100 // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
5101 // ENDFOR
5102 //
5103 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
5104 FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
5105 {
5106 return vreinterpretq_m128_f32(
5107 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
5108 }
5109
5110 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
5111 // floating-point elements, store the results in the lower 2 elements of dst,
5112 // and copy the upper 2 packed elements from a to the upper elements of dst.
5113 //
5114 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
5115 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
5116 // dst[95:64] := a[95:64]
5117 // dst[127:96] := a[127:96]
5118 //
5119 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
5120 FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
5121 {
5122 return vreinterpretq_m128_f32(
5123 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
5124 vget_high_f32(vreinterpretq_f32_m128(a))));
5125 }
5126
5127 // Convert packed signed 32-bit integers in a to packed single-precision
5128 // (32-bit) floating-point elements, store the results in the lower 2 elements
5129 // of dst, then convert the packed signed 32-bit integers in b to
5130 // single-precision (32-bit) floating-point element, and store the results in
5131 // the upper 2 elements of dst.
5132 //
5133 // dst[31:0] := Convert_Int32_To_FP32(a[31:0])
5134 // dst[63:32] := Convert_Int32_To_FP32(a[63:32])
5135 // dst[95:64] := Convert_Int32_To_FP32(b[31:0])
5136 // dst[127:96] := Convert_Int32_To_FP32(b[63:32])
5137 //
5138 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
5139 FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
5140 {
5141 return vreinterpretq_m128_f32(vcvtq_f32_s32(
5142 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
5143 }
5144
5145 // Convert the lower packed 8-bit integers in a to packed single-precision
5146 // (32-bit) floating-point elements, and store the results in dst.
5147 //
5148 // FOR j := 0 to 3
5149 // i := j*8
5150 // m := j*32
5151 // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
5152 // ENDFOR
5153 //
5154 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
5155 FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
5156 {
5157 return vreinterpretq_m128_f32(vcvtq_f32_s32(
5158 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
5159 }
5160
5161 // Convert packed unsigned 16-bit integers in a to packed single-precision
5162 // (32-bit) floating-point elements, and store the results in dst.
5163 //
5164 // FOR j := 0 to 3
5165 // i := j*16
5166 // m := j*32
5167 // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
5168 // ENDFOR
5169 //
5170 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
5171 FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
5172 {
5173 return vreinterpretq_m128_f32(
5174 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
5175 }
5176
5177 // Convert the lower packed unsigned 8-bit integers in a to packed
5178 // single-precision (32-bit) floating-point elements, and store the results in
5179 // dst.
5180 //
5181 // FOR j := 0 to 3
5182 // i := j*8
5183 // m := j*32
5184 // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
5185 // ENDFOR
5186 //
5187 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
5188 FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
5189 {
5190 return vreinterpretq_m128_f32(vcvtq_f32_u32(
5191 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
5192 }
5193
5194 // Converts the four single-precision, floating-point values of a to signed
5195 // 32-bit integer values using truncate.
5196 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
5197 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
5198 {
5199 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
5200 }
5201
5202 // Convert the lower double-precision (64-bit) floating-point element in a to a
5203 // 64-bit integer with truncation, and store the result in dst.
5204 //
5205 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
5206 //
5207 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
5208 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
5209 {
5210 #if defined(__aarch64__)
5211 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
5212 #else
5213 double ret = *((double *) &a);
5214 return (int64_t) ret;
5215 #endif
5216 }
5217
5218 // Convert the lower double-precision (64-bit) floating-point element in a to a
5219 // 64-bit integer with truncation, and store the result in dst.
5220 //
5221 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
5222 //
5223 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
5224 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
5225
5226 // Converts the four signed 32-bit integer values of a to single-precision,
5227 // floating-point values
5228 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
5229 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
5230 {
5231 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
5232 }
5233
5234 // Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
5235 // unsigned 16-bit integers.
5236 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
5237 {
5238 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
5239 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
5240 return vreinterpretq_m128i_u16(u16x8);
5241 }
5242
5243 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
5244 // unsigned 32-bit integers.
5245 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
5246 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
5247 {
5248 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
5249 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
5250 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
5251 return vreinterpretq_m128i_u32(u32x4);
5252 }
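
// Illustrative sketch of the zero-extension above (not from the original
// header): the low four bytes 0x01, 0x02, 0x03, 0x04 widen to the 32-bit lanes
// {1, 2, 3, 4}.
//
//   __m128i bytes = _mm_cvtsi32_si128(0x04030201); // low bytes: 01, 02, 03, 04
//   __m128i lanes = _mm_cvtepu8_epi32(bytes);      // {1, 2, 3, 4}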
5253
5254 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
5255 // unsigned 64-bit integers.
5256 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
5257 {
5258 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
5259 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
5260 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
5261 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
5262 return vreinterpretq_m128i_u64(u64x2);
5263 }
5264
5265 // Converts the eight signed 8-bit integers in the lower 64 bits to eight signed
5266 // 16-bit integers.
5267 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
5268 {
5269 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
5270 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
5271 return vreinterpretq_m128i_s16(s16x8);
5272 }
5273
5274 // Converts the four signed 8-bit integers in the lower 32 bits to four signed
5275 // 32-bit integers.
5276 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
5277 {
5278 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
5279 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
5280 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
5281 return vreinterpretq_m128i_s32(s32x4);
5282 }
5283
5284 // Converts the two signed 8-bit integers in the lower 16 bits to two signed
5285 // 64-bit integers.
5286 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
5287 {
5288 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
5289 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
5290 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
5291 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
5292 return vreinterpretq_m128i_s64(s64x2);
5293 }
5294
5295 // Converts the four signed 16-bit integers in the lower 64 bits to four signed
5296 // 32-bit integers.
5297 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
5298 {
5299 return vreinterpretq_m128i_s32(
5300 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
5301 }
5302
5303 // Converts the two signed 16-bit integers in the lower 32 bits to two signed
5304 // 64-bit integers.
5305 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
5306 {
5307 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
5308 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
5309 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
5310 return vreinterpretq_m128i_s64(s64x2);
5311 }
5312
5313 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
5314 // unsigned 32-bit integers.
5315 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
5316 {
5317 return vreinterpretq_m128i_u32(
5318 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
5319 }
5320
5321 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
5322 // unsigned 64-bit integers.
5323 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
5324 {
5325 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
5326 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
5327 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
5328 return vreinterpretq_m128i_u64(u64x2);
5329 }
5330
5331 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
5332 // unsigned 64-bit integers.
5333 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
5334 {
5335 return vreinterpretq_m128i_u64(
5336 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
5337 }
5338
5339 // Converts the two signed 32-bit integers in the lower 64 bits to two signed
5340 // 64-bit integers.
5341 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
5342 {
5343 return vreinterpretq_m128i_s64(
5344 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
5345 }
5346
5347 // Converts the four single-precision, floating-point values of a to signed
5348 // 32-bit integer values.
5349 //
5350 // r0 := (int) a0
5351 // r1 := (int) a1
5352 // r2 := (int) a2
5353 // r3 := (int) a3
5354 //
5355 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
5356 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
5357 // does not support! It is supported on ARMv8-A however.
5358 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
5359 {
5360 #if defined(__aarch64__)
5361 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
5362 #else
5363 uint32x4_t signmask = vdupq_n_u32(0x80000000);
5364 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
5365 vdupq_n_f32(0.5f)); /* +/- 0.5 */
5366 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
5367 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
5368 int32x4_t r_trunc =
5369 vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
5370 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
5371 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
5372 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
5373 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
5374 float32x4_t delta = vsubq_f32(
5375 vreinterpretq_f32_m128(a),
5376 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
5377 uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
5378 return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
5379 #endif
5380 }
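
// Packed round-to-nearest-even sketch (illustrative, matching the note above):
//
//   __m128 v  = _mm_set_ps(2.5f, 1.5f, -0.5f, 0.5f); // lanes 0..3 = 0.5, -0.5, 1.5, 2.5
//   __m128i r = _mm_cvtps_epi32(v);                  // lanes 0..3 = 0, 0, 2, 2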
5381
5382 // Convert packed single-precision (32-bit) floating-point elements in a to
5383 // packed 16-bit integers, and store the results in dst. Note: this intrinsic
5384 // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
5385 // 0x7FFFFFFF.
5386 //
5387 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
5388 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
5389 {
5390 return vreinterpret_m64_s16(
5391 vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
5392 }
5393
5394 // Copy the lower 32-bit integer in a to dst.
5395 //
5396 // dst[31:0] := a[31:0]
5397 //
5398 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
5399 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
5400 {
5401 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
5402 }
5403
5404 // Copy the lower 64-bit integer in a to dst.
5405 //
5406 // dst[63:0] := a[63:0]
5407 //
5408 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
5409 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
5410 {
5411 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
5412 }
5413
5414 // Copy the lower 64-bit integer in a to dst.
5415 //
5416 // dst[63:0] := a[63:0]
5417 //
5418 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
5419 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
5420
5421 // Moves 32-bit integer a to the least significant 32 bits of an __m128i object,
5422 // zero extending the upper bits.
5423 //
5424 // r0 := a
5425 // r1 := 0x0
5426 // r2 := 0x0
5427 // r3 := 0x0
5428 //
5429 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
5430 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
5431 {
5432 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
5433 }
5434
5435 // Moves 64-bit integer a to the least significant 64 bits of an __m128i object,
5436 // zero extending the upper bits.
5437 //
5438 // r0 := a
5439 // r1 := 0x0
5440 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
5441 {
5442 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
5443 }
5444
5445 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
5446 // compilation and does not generate any instructions, thus it has zero latency.
5447 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
5448 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
5449 {
5450 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
5451 }
5452
5453 // Applies a type cast to reinterpret four 32-bit floating point values passed
5454 // in as a 128-bit parameter as packed 32-bit integers.
5455 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
5456 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
5457 {
5458 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
5459 }
5460
5461 // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
5462 // compilation and does not generate any instructions, thus it has zero latency.
5463 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
5464 FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
5465 {
5466 #if defined(__aarch64__)
5467 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
5468 #else
5469 return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
5470 #endif
5471 }
5472
5473 // Applies a type cast to reinterpret four 32-bit integers passed in as a
5474 // 128-bit parameter as packed 32-bit floating point values.
5475 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
5476 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
5477 {
5478 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
5479 }
5480
5481 // Loads 128-bit value. :
5482 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
5483 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
5484 {
5485 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
5486 }
5487
5488 // Load a double-precision (64-bit) floating-point element from memory into both
5489 // elements of dst.
5490 //
5491 // dst[63:0] := MEM[mem_addr+63:mem_addr]
5492 // dst[127:64] := MEM[mem_addr+63:mem_addr]
5493 //
5494 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
5495 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
5496 {
5497 #if defined(__aarch64__)
5498 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
5499 #else
5500 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
5501 #endif
5502 }
5503
5504 // Load a double-precision (64-bit) floating-point element from memory into both
5505 // elements of dst.
5506 //
5507 // dst[63:0] := MEM[mem_addr+63:mem_addr]
5508 // dst[127:64] := MEM[mem_addr+63:mem_addr]
5509 //
5510 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
5511 #define _mm_load_pd1 _mm_load1_pd
5512
5513 // Load a double-precision (64-bit) floating-point element from memory into the
5514 // upper element of dst, and copy the lower element from a to dst. mem_addr does
5515 // not need to be aligned on any particular boundary.
5516 //
5517 // dst[63:0] := a[63:0]
5518 // dst[127:64] := MEM[mem_addr+63:mem_addr]
5519 //
5520 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
5521 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
5522 {
5523 #if defined(__aarch64__)
5524 return vreinterpretq_m128d_f64(
5525 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
5526 #else
5527 return vreinterpretq_m128d_f32(vcombine_f32(
5528 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
5529 #endif
5530 }
5531
5541 // Load a double-precision (64-bit) floating-point element from memory into both
5542 // elements of dst.
5543 //
5544 // dst[63:0] := MEM[mem_addr+63:mem_addr]
5545 // dst[127:64] := MEM[mem_addr+63:mem_addr]
5546 //
5547 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
5548 #define _mm_loaddup_pd _mm_load1_pd
5549
5550 // Loads 128-bit value. :
5551 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
5552 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
5553 {
5554 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
5555 }
5556
5557 // Load unaligned 32-bit integer from memory into the first element of dst.
5558 //
5559 // dst[31:0] := MEM[mem_addr+31:mem_addr]
5560 // dst[MAX:32] := 0
5561 //
5562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
5563 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
5564 {
5565 return vreinterpretq_m128i_s32(
5566 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
5567 }
5568
5569 // Convert packed double-precision (64-bit) floating-point elements in a to
5570 // packed single-precision (32-bit) floating-point elements, and store the
5571 // results in dst.
5572 //
5573 // FOR j := 0 to 1
5574 // i := 32*j
5575 // k := 64*j
5576 // dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k])
5577 // ENDFOR
5578 // dst[127:64] := 0
5579 //
5580 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
5581 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
5582 {
5583 #if defined(__aarch64__)
5584 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
5585 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
5586 #else
5587 float a0 = (float) ((double *) &a)[0];
5588 float a1 = (float) ((double *) &a)[1];
5589 return _mm_set_ps(0, 0, a1, a0);
5590 #endif
5591 }
5592
5593 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
5594 //
5595 // dst[63:0] := a[63:0]
5596 //
5597 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
5598 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
5599 {
5600 #if defined(__aarch64__)
5601 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
5602 #else
5603 return ((double *) &a)[0];
5604 #endif
5605 }
5606
5607 // Convert packed single-precision (32-bit) floating-point elements in a to
5608 // packed double-precision (64-bit) floating-point elements, and store the
5609 // results in dst.
5610 //
5611 // FOR j := 0 to 1
5612 // i := 64*j
5613 // k := 32*j
5614 // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
5615 // ENDFOR
5616 //
5617 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
5618 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
5619 {
5620 #if defined(__aarch64__)
5621 return vreinterpretq_m128d_f64(
5622 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
5623 #else
5624 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
5625 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
5626 return _mm_set_pd(a1, a0);
5627 #endif
5628 }
5629
5630 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
5631 // compilation and does not generate any instructions, thus it has zero latency.
5632 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
5634 {
5635 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
5636 }
5637
5638 // Cast vector of type __m128d to type __m128. This intrinsic is only used for
5639 // compilation and does not generate any instructions, thus it has zero latency.
5640 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
5642 {
5643 return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
5644 }
5645
5646 // Blend packed single-precision (32-bit) floating-point elements from a and b
5647 // using mask, and store the results in dst.
5648 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
5650 {
5651 // Use a signed shift right to create a mask with the sign bit
5652 uint32x4_t mask =
5653 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
5654 float32x4_t a = vreinterpretq_f32_m128(_a);
5655 float32x4_t b = vreinterpretq_f32_m128(_b);
5656 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
5657 }
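
// Illustrative sketch (hypothetical values): only lanes whose mask element has
// its sign bit set take the value from b, matching the SSE4.1 semantics.
//
//     __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   // elements {1, 2, 3, 4}
//     __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);   // elements {5, 6, 7, 8}
//     __m128 m = _mm_set_ps(-1.0f, 0.0f, -1.0f, 0.0f); // sign bit set in lanes 1 and 3
//     __m128 r = _mm_blendv_ps(a, b, m);               // elements {1, 6, 3, 8}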
5658
5659 // Blend packed single-precision (32-bit) floating-point elements from a and b
5660 // using mask, and store the results in dst.
5661 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
5663 {
5664 const uint32_t ALIGN_STRUCT(16)
5665 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
5666 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
5667 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
5668 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
5669 uint32x4_t mask = vld1q_u32(data);
5670 float32x4_t a = vreinterpretq_f32_m128(_a);
5671 float32x4_t b = vreinterpretq_f32_m128(_b);
5672 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
5673 }
5674
5675 // Blend packed double-precision (64-bit) floating-point elements from a and b
5676 // using mask, and store the results in dst.
5677 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
5679 {
5680 uint64x2_t mask =
5681 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
5682 #if defined(__aarch64__)
5683 float64x2_t a = vreinterpretq_f64_m128d(_a);
5684 float64x2_t b = vreinterpretq_f64_m128d(_b);
5685 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
5686 #else
5687 uint64x2_t a = vreinterpretq_u64_m128d(_a);
5688 uint64x2_t b = vreinterpretq_u64_m128d(_b);
5689 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
5690 #endif
5691 }
5692
5693 typedef struct {
5694 uint16_t res0;
5695 uint8_t res1 : 6;
5696 uint8_t bit22 : 1;
5697 uint8_t bit23 : 1;
5698 uint8_t res2;
5699 #if defined(__aarch64__)
5700 uint32_t res3;
5701 #endif
5702 } fpcr_bitfield;
5703
5704 // Macro: Set the rounding mode bits of the MXCSR control and status register to
5705 // the value in unsigned 32-bit integer a. The rounding mode may contain any of
5706 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
5707 // _MM_ROUND_TOWARD_ZERO
5708 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
5710 {
5711 union {
5712 fpcr_bitfield field;
5713 #if defined(__aarch64__)
5714 uint64_t value;
5715 #else
5716 uint32_t value;
5717 #endif
5718 } r;
5719
5720 #if defined(__aarch64__)
5721 asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
5722 #else
5723 asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
5724 #endif
5725
5726 switch (rounding) {
5727 case _MM_ROUND_TOWARD_ZERO:
5728 r.field.bit22 = 1;
5729 r.field.bit23 = 1;
5730 break;
5731 case _MM_ROUND_DOWN:
5732 r.field.bit22 = 0;
5733 r.field.bit23 = 1;
5734 break;
5735 case _MM_ROUND_UP:
5736 r.field.bit22 = 1;
5737 r.field.bit23 = 0;
5738 break;
5739 default: //_MM_ROUND_NEAREST
5740 r.field.bit22 = 0;
5741 r.field.bit23 = 0;
5742 }
5743
5744 #if defined(__aarch64__)
5745 asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
5746 #else
5747 asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
5748 #endif
5749 }
5750
FORCE_INLINE void _mm_setcsr(unsigned int a)
5752 {
5753 _MM_SET_ROUNDING_MODE(a);
5754 }
5755
5756 // Round the packed single-precision (32-bit) floating-point elements in a using
5757 // the rounding parameter, and store the results as packed single-precision
5758 // floating-point elements in dst.
5759 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
5761 {
5762 #if defined(__aarch64__)
5763 switch (rounding) {
5764 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
5765 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
5766 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
5767 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
5768 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
5769 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
5770 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
5771 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
5772 default: //_MM_FROUND_CUR_DIRECTION
5773 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
5774 }
5775 #else
5776 float *v_float = (float *) &a;
5777 __m128 zero, neg_inf, pos_inf;
5778
5779 switch (rounding) {
5780 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
5781 return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
5782 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
5783 return (__m128){floorf(v_float[0]), floorf(v_float[1]),
5784 floorf(v_float[2]), floorf(v_float[3])};
5785 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
5786 return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
5787 ceilf(v_float[3])};
5788 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
5789 zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
5790 neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
5791 floorf(v_float[2]), floorf(v_float[3]));
5792 pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
5793 ceilf(v_float[2]), ceilf(v_float[3]));
5794 return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
5795 default: //_MM_FROUND_CUR_DIRECTION
5796 return (__m128){roundf(v_float[0]), roundf(v_float[1]),
5797 roundf(v_float[2]), roundf(v_float[3])};
5798 }
5799 #endif
5800 }
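
// Illustrative sketch (hypothetical values): with _MM_FROUND_TO_NEAREST_INT on
// AArch64, ties round to the nearest even integer, as on x86.
//
//     __m128 v = _mm_set_ps(2.5f, -1.5f, 1.2f, -1.2f); // elements {-1.2, 1.2, -1.5, 2.5}
//     __m128 r = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//     // r holds elements {-1.0f, 1.0f, -2.0f, 2.0f}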
5801
5802 // Convert packed single-precision (32-bit) floating-point elements in a to
5803 // packed 32-bit integers, and store the results in dst.
5804 //
5805 // FOR j := 0 to 1
5806 // i := 32*j
5807 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
5808 // ENDFOR
5809 //
5810 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
5812 {
5813 #if defined(__aarch64__)
5814 return vreinterpret_m64_s32(
5815 vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
5816 #else
5817 return vreinterpret_m64_s32(
5818 vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
5819 _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
5820 #endif
5821 }
5822
5823 // Convert packed single-precision (32-bit) floating-point elements in a to
5824 // packed 32-bit integers, and store the results in dst.
5825 //
5826 // FOR j := 0 to 1
5827 // i := 32*j
5828 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
5829 // ENDFOR
5830 //
5831 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
5832 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
5833
5834 // Round the packed single-precision (32-bit) floating-point elements in a up to
5835 // an integer value, and store the results as packed single-precision
5836 // floating-point elements in dst.
5837 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
5839 {
5840 return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
5841 }
5842
5843 // Round the lower single-precision (32-bit) floating-point element in b up to
5844 // an integer value, store the result as a single-precision floating-point
5845 // element in the lower element of dst, and copy the upper 3 packed elements
5846 // from a to the upper elements of dst.
5847 //
5848 // dst[31:0] := CEIL(b[31:0])
5849 // dst[127:32] := a[127:32]
5850 //
5851 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
5853 {
5854 return _mm_move_ss(
5855 a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
5856 }
5857
5858 // Round the packed single-precision (32-bit) floating-point elements in a down
5859 // to an integer value, and store the results as packed single-precision
5860 // floating-point elements in dst.
5861 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
5863 {
5864 return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
5865 }
5866
5867 // Round the lower single-precision (32-bit) floating-point element in b down to
5868 // an integer value, store the result as a single-precision floating-point
5869 // element in the lower element of dst, and copy the upper 3 packed elements
5870 // from a to the upper elements of dst.
5871 //
5872 // dst[31:0] := FLOOR(b[31:0])
5873 // dst[127:32] := a[127:32]
5874 //
5875 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
5877 {
5878 return _mm_move_ss(
5879 a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
5880 }
5881
5882 // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
5883 // may perform better than _mm_loadu_si128 when the data crosses a cache line
5884 // boundary.
5885 //
5886 // dst[127:0] := MEM[mem_addr+127:mem_addr]
5887 //
5888 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
5889 #define _mm_lddqu_si128 _mm_loadu_si128
5890
5891 /* Miscellaneous Operations */
5892
5893 // Shifts the 8 signed 16-bit integers in a right by count bits while shifting
5894 // in the sign bit.
5895 //
5896 // r0 := a0 >> count
5897 // r1 := a1 >> count
5898 // ...
5899 // r7 := a7 >> count
5900 //
5901 // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5903 {
5904 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5905 if (unlikely(c > 15))
5906 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5907 return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5908 }
5909
5910 // Shifts the 4 signed 32-bit integers in a right by count bits while shifting
5911 // in the sign bit.
5912 //
5913 // r0 := a0 >> count
5914 // r1 := a1 >> count
5915 // r2 := a2 >> count
5916 // r3 := a3 >> count
5917 //
5918 // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5920 {
5921 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5922 if (unlikely(c > 31))
5923 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5924 return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5925 }
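
// Illustrative sketch (hypothetical values, assuming _mm_set1_epi16 and
// _mm_cvtsi32_si128 defined earlier in this file):
//
//     __m128i x = _mm_set1_epi16(-16);
//     __m128i c = _mm_cvtsi32_si128(2);
//     __m128i y = _mm_sra_epi16(x, c); // arithmetic shift: every lane is now -4
//     // A count larger than 15 (31 for epi32) fills each lane with its sign bit.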
5926
5927 // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
5928 // saturates.
5929 // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
5931 {
5932 return vreinterpretq_m128i_s8(
5933 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
5934 vqmovn_s16(vreinterpretq_s16_m128i(b))));
5935 }
5936
// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
// integers and saturates.
5939 //
5940 // r0 := UnsignedSaturate(a0)
5941 // r1 := UnsignedSaturate(a1)
5942 // ...
5943 // r7 := UnsignedSaturate(a7)
5944 // r8 := UnsignedSaturate(b0)
5945 // r9 := UnsignedSaturate(b1)
5946 // ...
5947 // r15 := UnsignedSaturate(b7)
5948 //
5949 // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
5951 {
5952 return vreinterpretq_m128i_u8(
5953 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
5954 vqmovun_s16(vreinterpretq_s16_m128i(b))));
5955 }
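
// Illustrative sketch of the two 16-bit pack flavours (hypothetical values):
//
//     __m128i w = _mm_set1_epi16(300);
//     __m128i s = _mm_packs_epi16(w, w);  // signed saturation: every byte is 127
//     __m128i u = _mm_packus_epi16(w, w); // unsigned saturation: every byte is 255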
5956
5957 // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
5958 // and saturates.
5959 //
5960 // r0 := SignedSaturate(a0)
5961 // r1 := SignedSaturate(a1)
5962 // r2 := SignedSaturate(a2)
5963 // r3 := SignedSaturate(a3)
5964 // r4 := SignedSaturate(b0)
5965 // r5 := SignedSaturate(b1)
5966 // r6 := SignedSaturate(b2)
5967 // r7 := SignedSaturate(b3)
5968 //
5969 // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
5971 {
5972 return vreinterpretq_m128i_s16(
5973 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
5974 vqmovn_s32(vreinterpretq_s32_m128i(b))));
5975 }
5976
5977 // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
5978 // integers and saturates.
5979 //
5980 // r0 := UnsignedSaturate(a0)
5981 // r1 := UnsignedSaturate(a1)
5982 // r2 := UnsignedSaturate(a2)
5983 // r3 := UnsignedSaturate(a3)
5984 // r4 := UnsignedSaturate(b0)
5985 // r5 := UnsignedSaturate(b1)
5986 // r6 := UnsignedSaturate(b2)
5987 // r7 := UnsignedSaturate(b3)
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
5989 {
5990 return vreinterpretq_m128i_u16(
5991 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
5992 vqmovun_s32(vreinterpretq_s32_m128i(b))));
5993 }
5994
5995 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
5996 // 8 signed or unsigned 8-bit integers in b.
5997 //
5998 // r0 := a0
5999 // r1 := b0
6000 // r2 := a1
6001 // r3 := b1
6002 // ...
6003 // r14 := a7
6004 // r15 := b7
6005 //
6006 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6008 {
6009 #if defined(__aarch64__)
6010 return vreinterpretq_m128i_s8(
6011 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6012 #else
6013 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6014 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6015 int8x8x2_t result = vzip_s8(a1, b1);
6016 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6017 #endif
6018 }
6019
6020 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6021 // lower 4 signed or unsigned 16-bit integers in b.
6022 //
6023 // r0 := a0
6024 // r1 := b0
6025 // r2 := a1
6026 // r3 := b1
6027 // r4 := a2
6028 // r5 := b2
6029 // r6 := a3
6030 // r7 := b3
6031 //
6032 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6034 {
6035 #if defined(__aarch64__)
6036 return vreinterpretq_m128i_s16(
6037 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6038 #else
6039 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6040 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6041 int16x4x2_t result = vzip_s16(a1, b1);
6042 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6043 #endif
6044 }
6045
// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
// lower 2 signed or unsigned 32-bit integers in b.
6048 //
6049 // r0 := a0
6050 // r1 := b0
6051 // r2 := a1
6052 // r3 := b1
6053 //
6054 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6056 {
6057 #if defined(__aarch64__)
6058 return vreinterpretq_m128i_s32(
6059 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6060 #else
6061 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6062 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6063 int32x2x2_t result = vzip_s32(a1, b1);
6064 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6065 #endif
6066 }
6067
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6069 {
6070 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6071 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6072 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6073 }
6074
6075 // Selects and interleaves the lower two single-precision, floating-point values
6076 // from a and b.
6077 //
6078 // r0 := a0
6079 // r1 := b0
6080 // r2 := a1
6081 // r3 := b1
6082 //
6083 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
6085 {
6086 #if defined(__aarch64__)
6087 return vreinterpretq_m128_f32(
6088 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6089 #else
6090 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
6091 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
6092 float32x2x2_t result = vzip_f32(a1, b1);
6093 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
6094 #endif
6095 }
6096
6097 // Unpack and interleave double-precision (64-bit) floating-point elements from
6098 // the low half of a and b, and store the results in dst.
6099 //
6100 // DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
6101 // dst[63:0] := src1[63:0]
6102 // dst[127:64] := src2[63:0]
6103 // RETURN dst[127:0]
6104 // }
6105 // dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
6106 //
6107 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
6109 {
6110 #if defined(__aarch64__)
6111 return vreinterpretq_m128d_f64(
6112 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6113 #else
6114 return vreinterpretq_m128d_s64(
6115 vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6116 vget_low_s64(vreinterpretq_s64_m128d(b))));
6117 #endif
6118 }
6119
6120 // Unpack and interleave double-precision (64-bit) floating-point elements from
6121 // the high half of a and b, and store the results in dst.
6122 //
6123 // DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
6124 // dst[63:0] := src1[127:64]
6125 // dst[127:64] := src2[127:64]
6126 // RETURN dst[127:0]
6127 // }
6128 // dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
6129 //
6130 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6132 {
6133 #if defined(__aarch64__)
6134 return vreinterpretq_m128d_f64(
6135 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6136 #else
6137 return vreinterpretq_m128d_s64(
6138 vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6139 vget_high_s64(vreinterpretq_s64_m128d(b))));
6140 #endif
6141 }
6142
6143 // Selects and interleaves the upper two single-precision, floating-point values
6144 // from a and b.
6145 //
6146 // r0 := a2
6147 // r1 := b2
6148 // r2 := a3
6149 // r3 := b3
6150 //
6151 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
6153 {
6154 #if defined(__aarch64__)
6155 return vreinterpretq_m128_f32(
6156 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6157 #else
6158 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
6159 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
6160 float32x2x2_t result = vzip_f32(a1, b1);
6161 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
6162 #endif
6163 }
6164
6165 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
6166 // 8 signed or unsigned 8-bit integers in b.
6167 //
6168 // r0 := a8
6169 // r1 := b8
6170 // r2 := a9
6171 // r3 := b9
6172 // ...
6173 // r14 := a15
6174 // r15 := b15
6175 //
6176 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
6178 {
6179 #if defined(__aarch64__)
6180 return vreinterpretq_m128i_s8(
6181 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6182 #else
6183 int8x8_t a1 =
6184 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
6185 int8x8_t b1 =
6186 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
6187 int8x8x2_t result = vzip_s8(a1, b1);
6188 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6189 #endif
6190 }
6191
6192 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
6193 // upper 4 signed or unsigned 16-bit integers in b.
6194 //
6195 // r0 := a4
6196 // r1 := b4
6197 // r2 := a5
6198 // r3 := b5
6199 // r4 := a6
6200 // r5 := b6
6201 // r6 := a7
6202 // r7 := b7
6203 //
6204 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
6206 {
6207 #if defined(__aarch64__)
6208 return vreinterpretq_m128i_s16(
6209 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6210 #else
6211 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
6212 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
6213 int16x4x2_t result = vzip_s16(a1, b1);
6214 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6215 #endif
6216 }
6217
6218 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
6219 // upper 2 signed or unsigned 32-bit integers in b.
6220 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
6222 {
6223 #if defined(__aarch64__)
6224 return vreinterpretq_m128i_s32(
6225 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6226 #else
6227 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
6228 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
6229 int32x2x2_t result = vzip_s32(a1, b1);
6230 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6231 #endif
6232 }
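
// Illustrative sketch of the low/high 32-bit interleaves (hypothetical values,
// assuming _mm_set_epi32 defined earlier in this file):
//
//     __m128i a = _mm_set_epi32(3, 2, 1, 0);  // elements {0, 1, 2, 3}
//     __m128i b = _mm_set_epi32(7, 6, 5, 4);  // elements {4, 5, 6, 7}
//     __m128i lo = _mm_unpacklo_epi32(a, b);  // elements {0, 4, 1, 5}
//     __m128i hi = _mm_unpackhi_epi32(a, b);  // elements {2, 6, 3, 7}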
6233
6234 // Interleaves the upper signed or unsigned 64-bit integer in a with the
6235 // upper signed or unsigned 64-bit integer in b.
6236 //
6237 // r0 := a1
6238 // r1 := b1
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
6240 {
6241 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
6242 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
6243 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
6244 }
6245
6246 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
6247 // in a, store the minimum and index in dst, and zero the remaining bits in dst.
6248 //
6249 // index[2:0] := 0
6250 // min[15:0] := a[15:0]
6251 // FOR j := 0 to 7
6252 // i := j*16
6253 // IF a[i+15:i] < min[15:0]
6254 // index[2:0] := j
6255 // min[15:0] := a[i+15:i]
6256 // FI
6257 // ENDFOR
6258 // dst[15:0] := min[15:0]
6259 // dst[18:16] := index[2:0]
6260 // dst[127:19] := 0
6261 //
6262 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
6264 {
6265 __m128i dst;
6266 uint16_t min, idx = 0;
6267 // Find the minimum value
6268 #if defined(__aarch64__)
6269 min = vminvq_u16(vreinterpretq_u16_m128i(a));
6270 #else
6271 __m64 tmp;
6272 tmp = vreinterpret_m64_u16(
6273 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
6274 vget_high_u16(vreinterpretq_u16_m128i(a))));
6275 tmp = vreinterpret_m64_u16(
6276 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
6277 tmp = vreinterpret_m64_u16(
6278 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
6279 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
6280 #endif
6281 // Get the index of the minimum value
6282 int i;
6283 for (i = 0; i < 8; i++) {
6284 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
6285 idx = (uint16_t) i;
6286 break;
6287 }
6288 a = _mm_srli_si128(a, 2);
6289 }
6290 // Generate result
6291 dst = _mm_setzero_si128();
6292 dst = vreinterpretq_m128i_u16(
6293 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
6294 dst = vreinterpretq_m128i_u16(
6295 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
6296 return dst;
6297 }
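
// Illustrative sketch (hypothetical values, assuming _mm_setr_epi16 defined
// earlier in this file):
//
//     __m128i v = _mm_setr_epi16(9, 4, 7, 4, 8, 6, 5, 3);
//     __m128i r = _mm_minpos_epu16(v);
//     // r[15:0] == 3 (the minimum), r[18:16] == 7 (its index), remaining bits are zero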
6298
6299 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
6300 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
6301 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
6302 // otherwise set CF to 0. Return the CF value.
6303 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
6305 {
6306 int64x2_t s64 =
6307 vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
6308 vreinterpretq_s64_m128i(b));
6309 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
6310 }
6311
6312 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
6313 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
6314 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
6315 // otherwise set CF to 0. Return the ZF value.
6316 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
6318 {
6319 int64x2_t s64 =
6320 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
6321 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
6322 }
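
// Illustrative sketch (hypothetical values): the two tests report on different
// conditions, mirroring the ZF/CF behaviour of PTEST.
//
//     __m128i a = _mm_set1_epi32(0x0F0F0F0F);
//     __m128i b = _mm_set1_epi32((int) 0xF0F0F0F0);
//     int zf = _mm_testz_si128(a, b); // 1: a AND b is all zeros
//     int cf = _mm_testc_si128(a, b); // 0: (NOT a) AND b is not all zeros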
6323
6324 // Extracts the selected signed or unsigned 8-bit integer from a and zero
6325 // extends.
6326 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
6327 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
6328
6329 // Inserts the least significant 8 bits of b into the selected 8-bit integer
6330 // of a.
6331 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
6332 // __constrange(0,16) int imm)
6333 #define _mm_insert_epi8(a, b, imm) \
6334 __extension__({ \
6335 vreinterpretq_m128i_s8( \
6336 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
6337 })
6338
6339 // Extracts the selected signed or unsigned 16-bit integer from a and zero
6340 // extends.
6341 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
6342 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
6343 #define _mm_extract_epi16(a, imm) \
6344 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
6345
6346 // Inserts the least significant 16 bits of b into the selected 16-bit integer
6347 // of a.
6348 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
6349 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
6350 // __constrange(0,8) int imm)
6351 #define _mm_insert_epi16(a, b, imm) \
6352 __extension__({ \
6353 vreinterpretq_m128i_s16( \
6354 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
6355 })
6356
6357 // Copy a to dst, and insert the 16-bit integer i into dst at the location
6358 // specified by imm8.
6359 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
6360 #define _mm_insert_pi16(a, b, imm) \
6361 __extension__({ \
6362 vreinterpret_m64_s16( \
6363 vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
6364 })
6365
6366 // Extracts the selected signed or unsigned 32-bit integer from a and zero
6367 // extends.
6368 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
6369 #define _mm_extract_epi32(a, imm) \
6370 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
6371
// Extracts the selected single-precision (32-bit) floating-point element from a.
6373 // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
6374 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
6375
6376 // Inserts the least significant 32 bits of b into the selected 32-bit integer
6377 // of a.
6378 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
6379 // __constrange(0,4) int imm)
6380 #define _mm_insert_epi32(a, b, imm) \
6381 __extension__({ \
6382 vreinterpretq_m128i_s32( \
6383 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
6384 })
6385
6386 // Extracts the selected signed or unsigned 64-bit integer from a and zero
6387 // extends.
6388 // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
6389 #define _mm_extract_epi64(a, imm) \
6390 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
6391
6392 // Inserts the least significant 64 bits of b into the selected 64-bit integer
6393 // of a.
6394 // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
6395 // __constrange(0,2) int imm)
6396 #define _mm_insert_epi64(a, b, imm) \
6397 __extension__({ \
6398 vreinterpretq_m128i_s64( \
6399 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
6400 })
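
// Illustrative sketch of the extract/insert macros above (hypothetical values):
//
//     __m128i v = _mm_setzero_si128();
//     v = _mm_insert_epi32(v, 42, 1);  // lane 1 becomes 42
//     int x = _mm_extract_epi32(v, 1); // x == 42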
6401
6402 // Count the number of bits set to 1 in unsigned 32-bit integer a, and
6403 // return that count in dst.
6404 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
6406 {
6407 #if defined(__aarch64__)
6408 #if __has_builtin(__builtin_popcount)
6409 return __builtin_popcount(a);
6410 #else
6411 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
6412 #endif
6413 #else
6414 uint32_t count = 0;
6415 uint8x8_t input_val, count8x8_val;
6416 uint16x4_t count16x4_val;
6417 uint32x2_t count32x2_val;
6418
6419 input_val = vld1_u8((uint8_t *) &a);
6420 count8x8_val = vcnt_u8(input_val);
6421 count16x4_val = vpaddl_u8(count8x8_val);
6422 count32x2_val = vpaddl_u16(count16x4_val);
6423
6424 vst1_u32(&count, count32x2_val);
6425 return count;
6426 #endif
6427 }
6428
6429 // Count the number of bits set to 1 in unsigned 64-bit integer a, and
6430 // return that count in dst.
6431 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
6433 {
6434 #if defined(__aarch64__)
6435 #if __has_builtin(__builtin_popcountll)
6436 return __builtin_popcountll(a);
6437 #else
6438 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
6439 #endif
6440 #else
6441 uint64_t count = 0;
6442 uint8x8_t input_val, count8x8_val;
6443 uint16x4_t count16x4_val;
6444 uint32x2_t count32x2_val;
6445 uint64x1_t count64x1_val;
6446
6447 input_val = vld1_u8((uint8_t *) &a);
6448 count8x8_val = vcnt_u8(input_val);
6449 count16x4_val = vpaddl_u8(count8x8_val);
6450 count32x2_val = vpaddl_u16(count16x4_val);
6451 count64x1_val = vpaddl_u32(count32x2_val);
6452 vst1_u64(&count, count64x1_val);
6453 return count;
6454 #endif
6455 }
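
// Illustrative sketch (hypothetical values):
//
//     int n32 = _mm_popcnt_u32(0xF0u);             // 4
//     int64_t n64 = _mm_popcnt_u64(0xFF00FF00ULL); // 16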
6456
6457 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
6458 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
6459 // transposed matrix in these vectors (row0 now contains column 0, etc.).
6460 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
6461 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
6462 do { \
6463 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
6464 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
6465 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
6466 vget_low_f32(ROW23.val[0])); \
6467 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
6468 vget_low_f32(ROW23.val[1])); \
6469 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
6470 vget_high_f32(ROW23.val[0])); \
6471 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
6472 vget_high_f32(ROW23.val[1])); \
6473 } while (0)
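
// Illustrative sketch (hypothetical values, assuming _mm_setr_ps defined
// earlier in this file):
//
//     __m128 r0 = _mm_setr_ps( 0.0f,  1.0f,  2.0f,  3.0f);
//     __m128 r1 = _mm_setr_ps( 4.0f,  5.0f,  6.0f,  7.0f);
//     __m128 r2 = _mm_setr_ps( 8.0f,  9.0f, 10.0f, 11.0f);
//     __m128 r3 = _mm_setr_ps(12.0f, 13.0f, 14.0f, 15.0f);
//     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
//     // r0 now holds {0, 4, 8, 12}, r1 holds {1, 5, 9, 13}, and so on.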
6474
6475 /* Crypto Extensions */
6476
6477 #if defined(__ARM_FEATURE_CRYPTO)
6478 // Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
6480 {
6481 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
6482 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
6483 return vreinterpretq_u64_p128(vmull_p64(a, b));
6484 }
6485 #else // ARMv7 polyfill
6486 // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
6487 //
6488 // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
6489 // 64-bit->128-bit polynomial multiply.
6490 //
6491 // It needs some work and is somewhat slow, but it is still faster than all
6492 // known scalar methods.
6493 //
6494 // Algorithm adapted to C from
6495 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
6496 // from "Fast Software Polynomial Multiplication on ARM Processors Using the
6497 // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
6498 // (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
6500 {
6501 poly8x8_t a = vreinterpret_p8_u64(_a);
6502 poly8x8_t b = vreinterpret_p8_u64(_b);
6503
6504 // Masks
6505 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
6506 vcreate_u8(0x00000000ffffffff));
6507 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
6508 vcreate_u8(0x0000000000000000));
6509
6510 // Do the multiplies, rotating with vext to get all combinations
6511 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
6512 uint8x16_t e =
6513 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
6514 uint8x16_t f =
6515 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
6516 uint8x16_t g =
6517 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
6518 uint8x16_t h =
6519 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
6520 uint8x16_t i =
6521 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
6522 uint8x16_t j =
6523 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
6524 uint8x16_t k =
6525 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
6526
6527 // Add cross products
6528 uint8x16_t l = veorq_u8(e, f); // L = E + F
6529 uint8x16_t m = veorq_u8(g, h); // M = G + H
6530 uint8x16_t n = veorq_u8(i, j); // N = I + J
6531
6532 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
6533 // instructions.
6534 #if defined(__aarch64__)
6535 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
6536 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
6537 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
6538 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
6539 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
6540 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
6541 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
6542 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
6543 #else
6544 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
6545 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
6546 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
6547 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
6548 #endif
6549 // t0 = (L) (P0 + P1) << 8
6550 // t1 = (M) (P2 + P3) << 16
6551 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
6552 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
6553 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
6554
6555 // t2 = (N) (P4 + P5) << 24
6556 // t3 = (K) (P6 + P7) << 32
6557 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
6558 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
6559 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
6560
6561 // De-interleave
6562 #if defined(__aarch64__)
6563 uint8x16_t t0 = vreinterpretq_u8_u64(
6564 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
6565 uint8x16_t t1 = vreinterpretq_u8_u64(
6566 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
6567 uint8x16_t t2 = vreinterpretq_u8_u64(
6568 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
6569 uint8x16_t t3 = vreinterpretq_u8_u64(
6570 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
6571 #else
6572 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
6573 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
6574 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
6575 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
6576 #endif
6577 // Shift the cross products
6578 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
6579 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
6580 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
6581 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
6582
6583 // Accumulate the products
6584 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
6585 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
6586 uint8x16_t mix = veorq_u8(d, cross1);
6587 uint8x16_t r = veorq_u8(mix, cross2);
6588 return vreinterpretq_u64_u8(r);
6589 }
6590 #endif // ARMv7 polyfill
6591
6592 // Perform a carry-less multiplication of two 64-bit integers, selected from a
6593 // and b according to imm8, and store the results in dst.
6594 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
6596 {
6597 uint64x2_t a = vreinterpretq_u64_m128i(_a);
6598 uint64x2_t b = vreinterpretq_u64_m128i(_b);
6599 switch (imm & 0x11) {
6600 case 0x00:
6601 return vreinterpretq_m128i_u64(
6602 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
6603 case 0x01:
6604 return vreinterpretq_m128i_u64(
6605 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
6606 case 0x10:
6607 return vreinterpretq_m128i_u64(
6608 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
6609 case 0x11:
6610 return vreinterpretq_m128i_u64(
6611 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
6612 default:
6613 abort();
6614 }
6615 }
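
// Illustrative sketch (hypothetical values, assuming _mm_set_epi64x defined
// earlier in this file). Bits 0 and 4 of imm8 select the 64-bit halves of a
// and b respectively.
//
//     __m128i x = _mm_set_epi64x(0, 3); // low half = 0b011
//     __m128i y = _mm_set_epi64x(0, 5); // low half = 0b101
//     __m128i p = _mm_clmulepi64_si128(x, y, 0x00);
//     // carry-less 3 * 5 = 0b1111, so the low 64 bits of p are 15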
6616
6617 #if !defined(__ARM_FEATURE_CRYPTO)
6618 /* clang-format off */
6619 #define SSE2NEON_AES_DATA(w) \
6620 { \
6621 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
6622 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
6623 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
6624 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
6625 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
6626 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
6627 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
6628 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
6629 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
6630 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
6631 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
6632 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
6633 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
6634 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
6635 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
6636 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
6637 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
6638 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
6639 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
6640 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
6641 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
6642 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
6643 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
6644 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
6645 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
6646 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
6647 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
6648 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
6649 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
6650 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
6651 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
6652 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
6653 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
6654 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
6655 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
6656 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
6657 w(0xb0), w(0x54), w(0xbb), w(0x16) \
6658 }
6659 /* clang-format on */
6660
6661 /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
6662 #define SSE2NEON_AES_H0(x) (x)
6663 static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
6664 #undef SSE2NEON_AES_H0
6665
6666 // In the absence of crypto extensions, implement aesenc using regular neon
6667 // intrinsics instead. See:
6668 // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
6669 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
6670 // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
// for more information. Reproduced with permission of the author.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
6673 {
6674 #if defined(__aarch64__)
6675 static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
6676 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
6677 0xc, 0x1, 0x6, 0xb};
6678 static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
6679 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
6680
6681 uint8x16_t v;
6682 uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
6683
6684 // shift rows
6685 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
6686
6687 // sub bytes
6688 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
6689 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
6690 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
6691 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
6692
6693 // mix columns
6694 w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
6695 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
6696 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
6697
6698 // add round key
6699 return vreinterpretq_m128i_u8(w) ^ RoundKey;
6700
6701 #else /* ARMv7-A NEON implementation */
6702 #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
6703 (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
6704 (b0))
6705 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
6706 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
6707 #define SSE2NEON_AES_U0(p) \
6708 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
6709 #define SSE2NEON_AES_U1(p) \
6710 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
6711 #define SSE2NEON_AES_U2(p) \
6712 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
6713 #define SSE2NEON_AES_U3(p) \
6714 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
6715 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
6716 SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
6717 SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
6718 SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
6719 SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
6720 };
6721 #undef SSE2NEON_AES_B2W
6722 #undef SSE2NEON_AES_F2
6723 #undef SSE2NEON_AES_F3
6724 #undef SSE2NEON_AES_U0
6725 #undef SSE2NEON_AES_U1
6726 #undef SSE2NEON_AES_U2
6727 #undef SSE2NEON_AES_U3
6728
6729 uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
6730 uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
6731 uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
6732 uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
6733
6734 __m128i out = _mm_set_epi32(
6735 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
6736 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
6737 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
6738 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
6739 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
6740 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
6741 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
6742 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
6743
6744 return _mm_xor_si128(out, RoundKey);
6745 #endif
6746 }
6747
6748 // Perform the last round of an AES encryption flow on data (state) in a using
6749 // the round key in RoundKey, and store the result in dst.
6750 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
6752 {
6753 /* FIXME: optimized for NEON */
6754 uint8_t v[4][4] = {
6755 [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
6756 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
6757 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
6758 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
6759 [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
6760 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
6761 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
6762 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
6763 [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
6764 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
6765 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
6766 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
6767 [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
6768 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
6769 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
6770 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
6771 };
6772 for (int i = 0; i < 16; i++)
6773 vreinterpretq_nth_u8_m128i(a, i) =
6774 v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
6775 return a;
6776 }
6777
6778 // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
6779 // This instruction generates a round key for AES encryption. See
6780 // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
6781 // for details.
6782 //
6783 // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
6785 {
6786 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
6787 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
6788 for (int i = 0; i < 4; ++i) {
6789 ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
6790 ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
6791 }
6792 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
6793 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
6794 }
6795 #undef SSE2NEON_AES_DATA
6796
6797 #else /* __ARM_FEATURE_CRYPTO */
6798 // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
6799 // AESMC and then manually applying the real key as an xor operation. This
6800 // unfortunately means an additional xor op; the compiler should be able to
6801 // optimize this away for repeated calls however. See
6802 // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
6803 // for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
6805 {
6806 return vreinterpretq_m128i_u8(
6807 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
6808 vreinterpretq_u8_m128i(b));
6809 }
6810
6811 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
6813 {
6814 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
6815 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
6816 RoundKey);
6817 }
6818
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
6820 {
6821 // AESE does ShiftRows and SubBytes on A
6822 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
6823
6824 uint8x16_t dest = {
6825 // Undo ShiftRows step from AESE and extract X1 and X3
6826 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
6827 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
6828 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
6829 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
6830 };
6831 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
6832 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
6833 }
6834 #endif
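
// Illustrative sketch of a single AES-128 block encryption built from the
// intrinsics above (hypothetical buffers; the 11 round keys are assumed to be
// expanded already):
//
//     __m128i state = _mm_loadu_si128((const __m128i *) plaintext);
//     state = _mm_xor_si128(state, round_key[0]);
//     for (int r = 1; r < 10; r++)
//         state = _mm_aesenc_si128(state, round_key[r]);
//     state = _mm_aesenclast_si128(state, round_key[10]);
//     _mm_storeu_si128((__m128i *) ciphertext, state);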
6835
6836 /* Streaming Extensions */
6837
6838 // Guarantees that every preceding store is globally visible before any
6839 // subsequent store.
6840 // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
FORCE_INLINE void _mm_sfence(void)
6842 {
6843 __sync_synchronize();
6844 }
6845
6846 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
6847 // point elements) from a into memory using a non-temporal memory hint.
6848 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
6850 {
6851 #if __has_builtin(__builtin_nontemporal_store)
6852 __builtin_nontemporal_store(a, (float32x4_t *) p);
6853 #else
6854 vst1q_f32(p, vreinterpretq_f32_m128(a));
6855 #endif
6856 }
6857
6858 // Stores the data in a to the address p without polluting the caches. If the
6859 // cache line containing address p is already in the cache, the cache will be
6860 // updated.
6861 // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
6863 {
6864 #if __has_builtin(__builtin_nontemporal_store)
6865 __builtin_nontemporal_store(a, p);
6866 #else
6867 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
6868 #endif
6869 }
6870
6871 // Load 128-bits of integer data from memory into dst using a non-temporal
6872 // memory hint. mem_addr must be aligned on a 16-byte boundary or a
6873 // general-protection exception may be generated.
6874 //
6875 // dst[127:0] := MEM[mem_addr+127:mem_addr]
6876 //
6877 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
6879 {
#if __has_builtin(__builtin_nontemporal_load)
6881 return __builtin_nontemporal_load(p);
6882 #else
6883 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
6884 #endif
6885 }
6886
6887 // Cache line containing p is flushed and invalidated from all caches in the
// coherency domain.
6889 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
FORCE_INLINE void _mm_clflush(void const *p)
6891 {
6892 (void) p;
6893 // no corollary for Neon?
6894 }
6895
6896 // Allocate aligned blocks of memory.
6897 // https://software.intel.com/en-us/
6898 // cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
6900 {
6901 void *ptr;
6902 if (align == 1)
6903 return malloc(size);
6904 if (align == 2 || (sizeof(void *) == 8 && align == 4))
6905 align = sizeof(void *);
6906 if (!posix_memalign(&ptr, align, size))
6907 return ptr;
6908 return NULL;
6909 }
6910
6911 // Free aligned memory that was allocated with _mm_malloc.
6912 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
FORCE_INLINE void _mm_free(void *addr)
6914 {
6915 free(addr);
6916 }
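
// Illustrative sketch (hypothetical values, assuming _mm_store_ps and
// _mm_set1_ps defined earlier in this file):
//
//     float *buf = (float *) _mm_malloc(256 * sizeof(float), 16);
//     if (buf) {
//         _mm_store_ps(buf, _mm_set1_ps(1.0f)); // 16-byte aligned store
//         _mm_free(buf);
//     }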
6917
6918 // Starting with the initial value in crc, accumulates a CRC32 value for
6919 // unsigned 8-bit integer v.
6920 // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
6922 {
6923 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6924 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
6925 : [c] "+r"(crc)
6926 : [v] "r"(v));
6927 #else
6928 crc ^= v;
6929 for (int bit = 0; bit < 8; bit++) {
6930 if (crc & 1)
6931 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
6932 else
6933 crc = (crc >> 1);
6934 }
6935 #endif
6936 return crc;
6937 }
6938
6939 // Starting with the initial value in crc, accumulates a CRC32 value for
6940 // unsigned 16-bit integer v.
6941 // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
6943 {
6944 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6945 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
6946 : [c] "+r"(crc)
6947 : [v] "r"(v));
6948 #else
6949 crc = _mm_crc32_u8(crc, v & 0xff);
6950 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
6951 #endif
6952 return crc;
6953 }
6954
6955 // Starting with the initial value in crc, accumulates a CRC32 value for
6956 // unsigned 32-bit integer v.
6957 // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
6959 {
6960 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6961 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
6962 : [c] "+r"(crc)
6963 : [v] "r"(v));
6964 #else
6965 crc = _mm_crc32_u16(crc, v & 0xffff);
6966 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
6967 #endif
6968 return crc;
6969 }
6970
6971 // Starting with the initial value in crc, accumulates a CRC32 value for
6972 // unsigned 64-bit integer v.
6973 // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
6975 {
6976 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6977 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
6978 : [c] "+r"(crc)
6979 : [v] "r"(v));
6980 #else
6981 crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
6982 crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
6983 #endif
6984 return crc;
6985 }
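
// Illustrative sketch of accumulating a CRC-32C (Castagnoli) checksum with the
// intrinsics above (hypothetical buffer `data` of length `len`):
//
//     uint32_t crc = 0xFFFFFFFF;
//     for (size_t i = 0; i < len; i++)
//         crc = _mm_crc32_u8(crc, data[i]);
//     crc = ~crc; // final inversion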
6986
6987 #if defined(__GNUC__) || defined(__clang__)
6988 #pragma pop_macro("ALIGN_STRUCT")
6989 #pragma pop_macro("FORCE_INLINE")
6990 #endif
6991
6992 #if defined(__GNUC__) && !defined(__clang__)
6993 #pragma GCC pop_options
6994 #endif
6995
6996 #endif
6997