1 #ifndef SSE2NEON_H
2 #define SSE2NEON_H
3
4 // This header file provides a simple API translation layer
// from SSE intrinsics to their corresponding Arm/AArch64 NEON versions
6 //
7 // This header file does not yet translate all of the SSE intrinsics.
8 //
9 // Contributors to this work are:
10 // John W. Ratcliff <jratcliffscarab@gmail.com>
11 // Brandon Rowlett <browlett@nvidia.com>
12 // Ken Fast <kfast@gdeb.com>
13 // Eric van Beurden <evanbeurden@nvidia.com>
14 // Alexander Potylitsin <apotylitsin@nvidia.com>
15 // Hasindu Gamaarachchi <hasindu2008@gmail.com>
16 // Jim Huang <jserv@biilabs.io>
17 // Mark Cheng <marktwtn@biilabs.io>
18 // Malcolm James MacLeod <malcolm@gulden.com>
19 // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20 // Sebastian Pop <spop@amazon.com>
21 // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22 // Danila Kutenin <danilak@google.com>
23 // François Turban (JishinMaster) <francois.turban@gmail.com>
24 // Pei-Hsuan Hung <afcidk@gmail.com>
25 // Yang-Hao Yuan <yanghau@biilabs.io>
26 // Syoyo Fujita <syoyo@lighttransport.com>
27 // Brecht Van Lommel <brecht@blender.org>
28
29 /*
30 * sse2neon is freely redistributable under the MIT License.
31 *
32 * Permission is hereby granted, free of charge, to any person obtaining a copy
33 * of this software and associated documentation files (the "Software"), to deal
34 * in the Software without restriction, including without limitation the rights
35 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
36 * copies of the Software, and to permit persons to whom the Software is
37 * furnished to do so, subject to the following conditions:
38 *
39 * The above copyright notice and this permission notice shall be included in
40 * all copies or substantial portions of the Software.
41 *
42 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
47 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
48 * SOFTWARE.
49 */
50
51 /* Tunable configurations */
52
53 /* Enable precise implementation of math operations
54 * This would slow down the computation a bit, but gives consistent result with
55 * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
56 */
57 /* _mm_min_ps and _mm_max_ps */
58 #ifndef SSE2NEON_PRECISE_MINMAX
59 #define SSE2NEON_PRECISE_MINMAX (0)
60 #endif
61 /* _mm_rcp_ps and _mm_div_ps */
62 #ifndef SSE2NEON_PRECISE_DIV
63 #define SSE2NEON_PRECISE_DIV (0)
64 #endif
65 /* _mm_sqrt_ps and _mm_rsqrt_ps */
66 #ifndef SSE2NEON_PRECISE_SQRT
67 #define SSE2NEON_PRECISE_SQRT (0)
68 #endif
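/* For example, a translation unit that needs bit-exact _mm_div_ps results
 * could enable the precise path before including this header (illustrative
 * usage only; the SSE2NEON_PRECISE_* macros above are the supported switches):
 *
 *     #define SSE2NEON_PRECISE_DIV (1)
 *     #include "sse2neon.h"
 */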
69
70 #if defined(__GNUC__) || defined(__clang__)
71 #pragma push_macro("FORCE_INLINE")
72 #pragma push_macro("ALIGN_STRUCT")
73 #define FORCE_INLINE static inline __attribute__((always_inline))
74 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
75 #ifndef likely
76 #define likely(x) __builtin_expect(!!(x), 1)
77 #endif
78 #ifndef unlikely
79 #define unlikely(x) __builtin_expect(!!(x), 0)
80 #endif
81 #else
82 #error "Macro name collisions may happen with unsupported compiler."
83 #ifdef FORCE_INLINE
84 #undef FORCE_INLINE
85 #endif
86 #define FORCE_INLINE static inline
87 #ifndef ALIGN_STRUCT
88 #define ALIGN_STRUCT(x) __declspec(align(x))
89 #endif
90 #endif
91 #ifndef likely
92 #define likely(x) (x)
93 #endif
94 #ifndef unlikely
95 #define unlikely(x) (x)
96 #endif
97
98 #include <stdint.h>
99 #include <stdlib.h>
100
101 /* Architecture-specific build options */
102 /* FIXME: #pragma GCC push_options is only available on GCC */
103 #if defined(__GNUC__)
104 #if defined(__arm__) && __ARM_ARCH == 7
/* According to the ARM C Language Extensions (ACLE) specification,
 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
 * architecture version that is supported.
108 */
109 #if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
110 #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
111 #endif
112 #if !defined(__clang__)
113 #pragma GCC push_options
114 #pragma GCC target("fpu=neon")
115 #endif
116 #elif defined(__aarch64__)
117 #if !defined(__clang__)
118 #pragma GCC push_options
119 #pragma GCC target("+simd")
120 #endif
121 #else
122 #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
123 #endif
124 #endif
125
126 #include <arm_neon.h>
127
/* Rounding functions require either AArch64 instructions or a libm fallback */
129 #if !defined(__aarch64__)
130 #include <math.h>
131 #endif
132
133 /* "__has_builtin" can be used to query support for built-in functions
134 * provided by gcc/clang and other compilers that support it.
135 */
136 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
137 /* Compatibility with gcc <= 9 */
138 #if __GNUC__ <= 9
139 #define __has_builtin(x) HAS##x
140 #define HAS__builtin_popcount 1
141 #define HAS__builtin_popcountll 1
142 #else
143 #define __has_builtin(x) 0
144 #endif
145 #endif
146
/**
 * MACRO for the shuffle parameter of _mm_shuffle_ps().
 * Argument fp3 is a digit [0-3] that selects the float from argument "b"
 * of _mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the
 * same for fp2 of the result. fp1 is a digit [0-3] that selects the float
 * from argument "a" of _mm_shuffle_ps that will be placed in fp1 of the
 * result. fp0 is the same for fp0 of the result.
 */
155 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
156 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
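/* For example, assuming vectors a = (a0, a1, a2, a3) and b = (b0, b1, b2, b3),
 * the (hypothetical) call
 *
 *     __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
 *
 * encodes the immediate 0xE4 and yields r = (a0, a1, b2, b3): the two low
 * lanes are selected from "a" and the two high lanes from "b".
 */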
157
158 /* Rounding mode macros. */
159 #define _MM_FROUND_TO_NEAREST_INT 0x00
160 #define _MM_FROUND_TO_NEG_INF 0x01
161 #define _MM_FROUND_TO_POS_INF 0x02
162 #define _MM_FROUND_TO_ZERO 0x03
163 #define _MM_FROUND_CUR_DIRECTION 0x04
164 #define _MM_FROUND_NO_EXC 0x08
165 #define _MM_ROUND_NEAREST 0x0000
166 #define _MM_ROUND_DOWN 0x2000
167 #define _MM_ROUND_UP 0x4000
168 #define _MM_ROUND_TOWARD_ZERO 0x6000
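/* A typical use is to combine a rounding mode with the exception-suppression
 * flag when calling _mm_round_ps (declared later in this header), e.g.
 *
 *     __m128 r = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 *
 * The _MM_ROUND_* values are the ones returned by _MM_GET_ROUNDING_MODE().
 */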
169
170 /* indicate immediate constant argument in a given range */
171 #define __constrange(a, b) const
172
173 /* A few intrinsics accept traditional data types like ints or floats, but
174 * most operate on data types that are specific to SSE.
175 * If a vector type ends in d, it contains doubles, and if it does not have
176 * a suffix, it contains floats. An integer vector type can contain any type
177 * of integer, from chars to shorts to unsigned long longs.
178 */
179 typedef int64x1_t __m64;
180 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
181 // On ARM 32-bit architecture, the float64x2_t is not supported.
182 // The data type __m128d should be represented in a different way for related
183 // intrinsic conversion.
184 #if defined(__aarch64__)
185 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
186 #else
187 typedef float32x4_t __m128d;
188 #endif
189 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
190
191 /* type-safe casting between types */
192
193 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
194 #define vreinterpretq_m128_f32(x) (x)
195 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
196
197 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
198 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
199 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
200 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
201
202 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
203 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
204 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
205 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
206
207 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
208 #define vreinterpretq_f32_m128(x) (x)
209 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
210
211 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
212 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
213 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
214 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
215
216 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
217 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
218 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
219 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
220
221 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
222 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
223 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
224 #define vreinterpretq_m128i_s64(x) (x)
225
226 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
227 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
228 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
229 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
230
231 #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
232 #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
233
234 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
235 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
236 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
237 #define vreinterpretq_s64_m128i(x) (x)
238
239 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
240 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
241 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
242 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
243
244 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
245 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
246 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
247 #define vreinterpret_m64_s64(x) (x)
248
249 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
250 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
251 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
252 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
253
254 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
255 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
256 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
257
258 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
259 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
260 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
261 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
262
263 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
264 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
265 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
266 #define vreinterpret_s64_m64(x) (x)
267
268 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
269
270 #if defined(__aarch64__)
271 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
272 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
273
274 #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
275
276 #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
277 #define vreinterpretq_m128d_f64(x) (x)
278
279 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
280
281 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
282 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
283
284 #define vreinterpretq_f64_m128d(x) (x)
285 #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
286 #else
287 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
288 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
289
290 #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
291 #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
292
293 #define vreinterpretq_m128d_f32(x) (x)
294
295 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
296
297 #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
298 #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
299
300 #define vreinterpretq_f32_m128d(x) (x)
301 #endif
302
// A struct named 'SIMDVec' is defined in this header file and can be used by
// applications that attempt to access the contents of an __m128 struct
// directly. Note that accessing the __m128 struct directly is considered bad
// coding practice by Microsoft: @see:
// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly, so the developer can use SIMDVec as an alias for it. Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// This union is intended to allow direct access to an __m128 variable using
// the names that the MSVC compiler provides. It should really only be used
// when trying to access the members of the vector as integer values. GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally, direct accesses to SIMD vectors should be avoided since they can
// cause a performance hit. If they really are needed, however, the original
// __m128 variable can be aliased with a pointer to this union and used to
// access individual components. The use of this union should be hidden behind
// a macro that is used throughout the codebase to access the members instead
// of always declaring this type of variable.
326 typedef union ALIGN_STRUCT(16) SIMDVec {
327 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
328 int8_t m128_i8[16]; // as signed 8-bit integers.
329 int16_t m128_i16[8]; // as signed 16-bit integers.
330 int32_t m128_i32[4]; // as signed 32-bit integers.
331 int64_t m128_i64[2]; // as signed 64-bit integers.
332 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
333 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
334 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
335 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
336 } SIMDVec;
337
338 // casting using SIMDVec
339 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
340 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
341 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
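// Illustrative (hypothetical) use of the casting macros above: extracting the
// n-th unsigned 32-bit lane of an __m128i without going through a NEON
// intrinsic.
//
//     __m128i v = _mm_set_epi32(4, 3, 2, 1);
//     uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); // lane0 == 1
//
// Prefer the vreinterpretq_*_m128i() casts for hot paths, since this
// union-based access may force the vector out to memory.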
342
343 // Function declaration
344 // SSE
345 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE();
346 FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
347 // SSE2
348 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
349 FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
350 FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
351 FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
352 FORCE_INLINE __m128d _mm_set_pd(double, double);
353 // SSE4.1
354 FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
355 FORCE_INLINE __m128 _mm_ceil_ps(__m128);
356 FORCE_INLINE __m128d _mm_floor_pd(__m128d);
357 FORCE_INLINE __m128 _mm_floor_ps(__m128);
358 FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
359 FORCE_INLINE __m128 _mm_round_ps(__m128, int);
360 // SSE4.2
361 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
362
/* Backwards compatibility for compilers lacking specific type support */

// Older gcc versions do not provide the vld1q_u8_x4 intrinsic
366 #if defined(__GNUC__) && !defined(__clang__) && \
367 ((__GNUC__ <= 10 && defined(__arm__)) || \
368 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
369 (__GNUC__ <= 9 && defined(__aarch64__)))
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
371 {
372 uint8x16x4_t ret;
373 ret.val[0] = vld1q_u8(p + 0);
374 ret.val[1] = vld1q_u8(p + 16);
375 ret.val[2] = vld1q_u8(p + 32);
376 ret.val[3] = vld1q_u8(p + 48);
377 return ret;
378 }
379 #else
380 // Wraps vld1q_u8_x4
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
382 {
383 return vld1q_u8_x4(p);
384 }
385 #endif
386
387 /* Function Naming Conventions
388 * The naming convention of SSE intrinsics is straightforward. A generic SSE
389 * intrinsic function is given as follows:
390 * _mm_<name>_<data_type>
391 *
392 * The parts of this format are given as follows:
393 * 1. <name> describes the operation performed by the intrinsic
394 * 2. <data_type> identifies the data type of the function's primary arguments
395 *
396 * This last part, <data_type>, is a little complicated. It identifies the
397 * content of the input values, and can be set to any of the following values:
398 * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
400 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
401 * signed integers
402 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
403 * unsigned integers
404 * + si128 - unspecified 128-bit vector or 256-bit vector
405 * + m128/m128i/m128d - identifies input vector types when they are different
406 * than the type of the returned vector
407 *
408 * For example, _mm_setzero_ps. The _mm implies that the function returns
409 * a 128-bit vector. The _ps at the end implies that the argument vectors
410 * contain floats.
411 *
412 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 * // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
414 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
415 * // Set packed 8-bit integers
 * // 128 bits, 16 chars, 8 bits each
417 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
418 * 4, 5, 12, 13, 6, 7, 14, 15);
419 * // Shuffle packed 8-bit integers
420 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
421 *
422 * Data (Number, Binary, Byte Index):
423 +------+------+-------------+------+------+-------------+
424 | 1 | 2 | 3 | 4 | Number
425 +------+------+------+------+------+------+------+------+
426 | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
427 +------+------+------+------+------+------+------+------+
428 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
429 +------+------+------+------+------+------+------+------+
430
431 +------+------+------+------+------+------+------+------+
432 | 5 | 6 | 7 | 8 | Number
433 +------+------+------+------+------+------+------+------+
434 | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
435 +------+------+------+------+------+------+------+------+
436 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
437 +------+------+------+------+------+------+------+------+
438 * Index (Byte Index):
439 +------+------+------+------+------+------+------+------+
440 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
441 +------+------+------+------+------+------+------+------+
442
443 +------+------+------+------+------+------+------+------+
444 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
445 +------+------+------+------+------+------+------+------+
446 * Result:
447 +------+------+------+------+------+------+------+------+
448 | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
449 +------+------+------+------+------+------+------+------+
450 | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
451 +------+------+------+------+------+------+------+------+
452 | 256 | 2 | 5 | 6 | Number
453 +------+------+------+------+------+------+------+------+
454
455 +------+------+------+------+------+------+------+------+
456 | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
457 +------+------+------+------+------+------+------+------+
458 | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
459 +------+------+------+------+------+------+------+------+
460 | 3 | 7 | 4 | 8 | Number
461 +------+------+------+------+------+------+-------------+
462 */
463
464 /* Constants for use with _mm_prefetch. */
465 enum _mm_hint {
466 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
467 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
468 _MM_HINT_T1 = 2, /* load data to L2 cache only */
469 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
470 _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
471 _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
472 _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
473 _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
474 };
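// Illustrative use, assuming the _mm_prefetch() wrapper provided elsewhere in
// this header: hint that 'ptr' will soon be read and should be pulled into the
// nearest cache level.
//
//     _mm_prefetch((const char *) ptr, _MM_HINT_T0);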
475
476 // The bit field mapping to the FPCR(floating-point control register)
477 typedef struct {
478 uint16_t res0;
479 uint8_t res1 : 6;
480 uint8_t bit22 : 1;
481 uint8_t bit23 : 1;
482 uint8_t res2;
483 #if defined(__aarch64__)
484 uint32_t res3;
485 #endif
486 } fpcr_bitfield;
487
488 // Takes the upper 64 bits of a and places it in the low end of the result
489 // Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
491 {
492 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
493 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
494 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
495 }
496
// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from b, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
501 {
502 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
503 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
504 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
505 }
506
FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
508 {
509 float32x2_t a21 = vget_high_f32(
510 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
511 float32x2_t b03 = vget_low_f32(
512 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
513 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
514 }
515
FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
517 {
518 float32x2_t a03 = vget_low_f32(
519 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
520 float32x2_t b21 = vget_high_f32(
521 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
522 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
523 }
524
FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
526 {
527 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
528 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
529 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
530 }
531
FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
533 {
534 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
535 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
536 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
537 }
538
FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
540 {
541 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
542 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
543 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
544 }
545
// Keeps the low 64 bits of a in the low end and puts the high 64 bits of b in
// the high end.
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
549 {
550 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
551 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
552 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
553 }
554
FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
556 {
557 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
558 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
559 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
560 }
561
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
563 {
564 float32x2_t a22 =
565 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
566 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
567 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
568 }
569
FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
571 {
572 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
573 float32x2_t b22 =
574 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
575 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
576 }
577
FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
579 {
580 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
581 float32x2_t a22 =
582 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
583 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
584 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
585 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
586 }
587
FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
589 {
590 float32x2_t a33 =
591 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
592 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
593 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
594 }
595
FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
597 {
598 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
599 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
600 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
601 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
602 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
603 }
604
FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
606 {
607 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
609 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
610 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
611 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
612 }
613
FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
615 {
616 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
618 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
619 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
620 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
621 }
622
623 // Kahan summation for accurate summation of floating-point numbers.
624 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
626 {
627 y -= *c;
628 float t = *sum + y;
629 *c = (t - *sum) - y;
630 *sum = t;
631 }
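// Example (sketch): accumulating an array with a running compensation term.
//
//     float sum = 0.0f, comp = 0.0f;
//     for (int i = 0; i < n; i++)
//         _sse2neon_kadd_f32(&sum, &comp, data[i]);
//
// 'sum' then holds the compensated total; 'comp' carries the rounding error
// that a plain summation would have dropped.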
632
633 #if defined(__ARM_FEATURE_CRYPTO)
634 // Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
636 {
637 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
638 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
639 return vreinterpretq_u64_p128(vmull_p64(a, b));
640 }
641 #else // ARMv7 polyfill
642 // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
643 //
644 // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
645 // 64-bit->128-bit polynomial multiply.
646 //
647 // It needs some work and is somewhat slow, but it is still faster than all
648 // known scalar methods.
649 //
650 // Algorithm adapted to C from
651 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
652 // from "Fast Software Polynomial Multiplication on ARM Processors Using the
653 // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
654 // (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
656 {
657 poly8x8_t a = vreinterpret_p8_u64(_a);
658 poly8x8_t b = vreinterpret_p8_u64(_b);
659
660 // Masks
661 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
662 vcreate_u8(0x00000000ffffffff));
663 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
664 vcreate_u8(0x0000000000000000));
665
666 // Do the multiplies, rotating with vext to get all combinations
667 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
668 uint8x16_t e =
669 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
670 uint8x16_t f =
671 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
672 uint8x16_t g =
673 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
674 uint8x16_t h =
675 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
676 uint8x16_t i =
677 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
678 uint8x16_t j =
679 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
680 uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
682
683 // Add cross products
684 uint8x16_t l = veorq_u8(e, f); // L = E + F
685 uint8x16_t m = veorq_u8(g, h); // M = G + H
686 uint8x16_t n = veorq_u8(i, j); // N = I + J
687
688 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
689 // instructions.
690 #if defined(__aarch64__)
691 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
692 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
693 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
694 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
695 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
696 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
697 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
698 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
699 #else
700 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
701 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
702 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
703 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
704 #endif
705 // t0 = (L) (P0 + P1) << 8
706 // t1 = (M) (P2 + P3) << 16
707 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
708 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
709 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
710
711 // t2 = (N) (P4 + P5) << 24
712 // t3 = (K) (P6 + P7) << 32
713 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
714 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
715 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
716
717 // De-interleave
718 #if defined(__aarch64__)
719 uint8x16_t t0 = vreinterpretq_u8_u64(
720 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
721 uint8x16_t t1 = vreinterpretq_u8_u64(
722 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
723 uint8x16_t t2 = vreinterpretq_u8_u64(
724 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
725 uint8x16_t t3 = vreinterpretq_u8_u64(
726 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
727 #else
728 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
729 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
730 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
731 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
732 #endif
733 // Shift the cross products
734 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
735 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
736 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
737 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
738
739 // Accumulate the products
740 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
741 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
742 uint8x16_t mix = veorq_u8(d, cross1);
743 uint8x16_t r = veorq_u8(mix, cross2);
744 return vreinterpretq_u64_u8(r);
745 }
746 #endif // ARMv7 polyfill
747
748 // C equivalent:
749 // __m128i _mm_shuffle_epi32_default(__m128i a,
750 // __constrange(0, 255) int imm) {
751 // __m128i ret;
752 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
753 // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
754 // return ret;
755 // }
756 #define _mm_shuffle_epi32_default(a, imm) \
757 __extension__({ \
758 int32x4_t ret; \
759 ret = vmovq_n_s32( \
760 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
761 ret = vsetq_lane_s32( \
762 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
763 ret, 1); \
764 ret = vsetq_lane_s32( \
765 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
766 ret, 2); \
767 ret = vsetq_lane_s32( \
768 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
769 ret, 3); \
770 vreinterpretq_m128i_s32(ret); \
771 })
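// For instance, a (hypothetical) call with imm == _MM_SHUFFLE(0, 1, 2, 3),
// i.e. 0x1B, reverses the four 32-bit lanes:
//
//     __m128i r = _mm_shuffle_epi32_default(v, 0x1B);
//     // r lanes (0..3) = v[3], v[2], v[1], v[0]
//
// Each two-bit field of imm selects the source lane for one destination lane,
// exactly as in the C equivalent above.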
772
773 // Takes the upper 64 bits of a and places it in the low end of the result
774 // Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
776 {
777 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
778 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
779 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
780 }
781
// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from a, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
786 {
787 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
788 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
789 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
790 }
791
792 // rotates the least significant 32 bits into the most significant 32 bits, and
793 // shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
795 {
796 return vreinterpretq_m128i_s32(
797 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
798 }
799
800 // rotates the most significant 32 bits into the least significant 32 bits, and
801 // shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
803 {
804 return vreinterpretq_m128i_s32(
805 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
806 }
807
808 // gets the lower 64 bits of a, and places it in the upper 64 bits
809 // gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
811 {
812 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
813 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
814 }
815
816 // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
817 // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
819 {
820 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
821 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
822 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
823 }
824
825 // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
826 // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
827 // places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
829 {
830 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
831 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
832 }
833
FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
835 {
836 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
837 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
838 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
839 }
840
FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
842 {
843 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
844 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
845 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
846 }
847
FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
849 {
850 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
851 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
852 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
853 }
854
855 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
856 // int imm)
857 #if defined(__aarch64__)
858 #define _mm_shuffle_epi32_splat(a, imm) \
859 __extension__({ \
860 vreinterpretq_m128i_s32( \
861 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
862 })
863 #else
864 #define _mm_shuffle_epi32_splat(a, imm) \
865 __extension__({ \
866 vreinterpretq_m128i_s32( \
867 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
868 })
869 #endif
870
871 // NEON does not support a general purpose permute intrinsic
872 // Selects four specific single-precision, floating-point values from a and b,
873 // based on the mask i.
874 //
875 // C equivalent:
876 // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
877 // __constrange(0, 255) int imm) {
878 // __m128 ret;
879 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
880 // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
881 // return ret;
882 // }
883 //
884 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
885 #define _mm_shuffle_ps_default(a, b, imm) \
886 __extension__({ \
887 float32x4_t ret; \
888 ret = vmovq_n_f32( \
889 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
890 ret = vsetq_lane_f32( \
891 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
892 ret, 1); \
893 ret = vsetq_lane_f32( \
894 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
895 ret, 2); \
896 ret = vsetq_lane_f32( \
897 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
898 ret, 3); \
899 vreinterpretq_m128_f32(ret); \
900 })
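// Sketch of how this generic path behaves (the specialised helpers above cover
// the common immediates): with imm == _MM_SHUFFLE(1, 0, 3, 2), i.e. 0x4E,
//
//     __m128 r = _mm_shuffle_ps_default(a, b, _MM_SHUFFLE(1, 0, 3, 2));
//     // r lanes (0..3) = a[2], a[3], b[0], b[1]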
901
902 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
903 // by imm.
904 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
905 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
906 // __constrange(0,255) int
907 // imm)
908 #define _mm_shufflelo_epi16_function(a, imm) \
909 __extension__({ \
910 int16x8_t ret = vreinterpretq_s16_m128i(a); \
911 int16x4_t lowBits = vget_low_s16(ret); \
912 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
913 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
914 1); \
915 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
916 2); \
917 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
918 3); \
919 vreinterpretq_m128i_s16(ret); \
920 })
921
922 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
923 // by imm.
924 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
925 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
926 // __constrange(0,255) int
927 // imm)
928 #define _mm_shufflehi_epi16_function(a, imm) \
929 __extension__({ \
930 int16x8_t ret = vreinterpretq_s16_m128i(a); \
931 int16x4_t highBits = vget_high_s16(ret); \
932 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
933 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
934 5); \
935 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
936 6); \
937 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
938 7); \
939 vreinterpretq_m128i_s16(ret); \
940 })
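// As an illustration, broadcasting the lowest 16-bit element of v across the
// low four lanes (lanes 4-7 pass through unchanged) could look like:
//
//     __m128i r = _mm_shufflelo_epi16_function(v, _MM_SHUFFLE(0, 0, 0, 0));
//
// _mm_shufflehi_epi16_function behaves the same way but rewrites lanes 4-7
// from the upper half of the source.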
941
942 /* SSE */
943
944 // Adds the four single-precision, floating-point values of a and b.
945 //
946 // r0 := a0 + b0
947 // r1 := a1 + b1
948 // r2 := a2 + b2
949 // r3 := a3 + b3
950 //
951 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
953 {
954 return vreinterpretq_m128_f32(
955 vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
956 }
957
958 // adds the scalar single-precision floating point values of a and b.
959 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
961 {
962 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
963 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
964 // the upper values in the result must be the remnants of <a>.
    return vreinterpretq_m128_f32(vaddq_f32(vreinterpretq_f32_m128(a), value));
966 }
967
968 // Computes the bitwise AND of the four single-precision, floating-point values
969 // of a and b.
970 //
971 // r0 := a0 & b0
972 // r1 := a1 & b1
973 // r2 := a2 & b2
974 // r3 := a3 & b3
975 //
976 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
978 {
979 return vreinterpretq_m128_s32(
980 vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
981 }
982
983 // Computes the bitwise AND-NOT of the four single-precision, floating-point
984 // values of a and b.
985 //
986 // r0 := ~a0 & b0
987 // r1 := ~a1 & b1
988 // r2 := ~a2 & b2
989 // r3 := ~a3 & b3
990 //
991 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
993 {
994 return vreinterpretq_m128_s32(
995 vbicq_s32(vreinterpretq_s32_m128(b),
996 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
997 }
998
999 // Average packed unsigned 16-bit integers in a and b, and store the results in
1000 // dst.
1001 //
1002 // FOR j := 0 to 3
1003 // i := j*16
1004 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
1005 // ENDFOR
1006 //
1007 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1009 {
1010 return vreinterpret_m64_u16(
1011 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1012 }
1013
1014 // Average packed unsigned 8-bit integers in a and b, and store the results in
1015 // dst.
1016 //
1017 // FOR j := 0 to 7
1018 // i := j*8
1019 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
1020 // ENDFOR
1021 //
1022 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1024 {
1025 return vreinterpret_m64_u8(
1026 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1027 }
1028
1029 // Compares for equality.
1030 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
1032 {
1033 return vreinterpretq_m128_u32(
1034 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1035 }
1036
1037 // Compares for equality.
1038 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
1040 {
1041 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1042 }
1043
1044 // Compares for greater than or equal.
1045 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
1047 {
1048 return vreinterpretq_m128_u32(
1049 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1050 }
1051
1052 // Compares for greater than or equal.
1053 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
1055 {
1056 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1057 }
1058
1059 // Compares for greater than.
1060 //
1061 // r0 := (a0 > b0) ? 0xffffffff : 0x0
1062 // r1 := (a1 > b1) ? 0xffffffff : 0x0
1063 // r2 := (a2 > b2) ? 0xffffffff : 0x0
1064 // r3 := (a3 > b3) ? 0xffffffff : 0x0
1065 //
1066 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
1068 {
1069 return vreinterpretq_m128_u32(
1070 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1071 }
1072
1073 // Compares for greater than.
1074 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
1076 {
1077 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1078 }
1079
1080 // Compares for less than or equal.
1081 //
1082 // r0 := (a0 <= b0) ? 0xffffffff : 0x0
1083 // r1 := (a1 <= b1) ? 0xffffffff : 0x0
1084 // r2 := (a2 <= b2) ? 0xffffffff : 0x0
1085 // r3 := (a3 <= b3) ? 0xffffffff : 0x0
1086 //
1087 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
1089 {
1090 return vreinterpretq_m128_u32(
1091 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1092 }
1093
1094 // Compares for less than or equal.
1095 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
1097 {
1098 return _mm_move_ss(a, _mm_cmple_ps(a, b));
1099 }
1100
1101 // Compares for less than
1102 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
1104 {
1105 return vreinterpretq_m128_u32(
1106 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1107 }
1108
1109 // Compares for less than
1110 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
1112 {
1113 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1114 }
1115
1116 // Compares for inequality.
1117 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
1119 {
1120 return vreinterpretq_m128_u32(vmvnq_u32(
1121 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
1122 }
1123
1124 // Compares for inequality.
1125 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
1127 {
1128 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1129 }
1130
1131 // Compares for not greater than or equal.
1132 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
1134 {
1135 return _mm_cmplt_ps(a, b);
1136 }
1137
1138 // Compares for not greater than or equal.
1139 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
1141 {
1142 return _mm_cmplt_ss(a, b);
1143 }
1144
1145 // Compares for not greater than.
1146 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
1148 {
1149 return _mm_cmple_ps(a, b);
1150 }
1151
1152 // Compares for not greater than.
1153 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
1155 {
1156 return _mm_cmple_ss(a, b);
1157 }
1158
1159 // Compares for not less than or equal.
1160 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
1162 {
1163 return _mm_cmpgt_ps(a, b);
1164 }
1165
1166 // Compares for not less than or equal.
1167 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
1169 {
1170 return _mm_cmpgt_ss(a, b);
1171 }
1172
1173 // Compares for not less than.
1174 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
1176 {
1177 return _mm_cmpge_ps(a, b);
1178 }
1179
1180 // Compares for not less than.
1181 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
1183 {
1184 return _mm_cmpge_ss(a, b);
1185 }
1186
1187 // Compares the four 32-bit floats in a and b to check if any values are NaN.
1188 // Ordered compare between each value returns true for "orderable" and false for
1189 // "not orderable" (NaN).
1190 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
1191 // also:
1192 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1193 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1195 {
1196 // Note: NEON does not have ordered compare builtin
1197 // Need to compare a eq a and b eq b to check for NaN
1198 // Do AND of results to get final
1199 uint32x4_t ceqaa =
1200 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1201 uint32x4_t ceqbb =
1202 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1203 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1204 }
1205
1206 // Compares for ordered.
1207 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
1209 {
1210 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1211 }
1212
1213 // Compares for unordered.
1214 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
1216 {
1217 uint32x4_t f32a =
1218 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1219 uint32x4_t f32b =
1220 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1221 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1222 }
1223
1224 // Compares for unordered.
1225 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
1227 {
1228 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1229 }
1230
1231 // Compares the lower single-precision floating point scalar values of a and b
1232 // using an equality operation. :
1233 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1235 {
1236 // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
1237 // vreinterpretq_f32_m128(b)), 0);
1238 uint32x4_t a_not_nan =
1239 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1240 uint32x4_t b_not_nan =
1241 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1242 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1243 uint32x4_t a_eq_b =
1244 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1245 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) & 0x1;
1246 }
1247
1248 // Compares the lower single-precision floating point scalar values of a and b
1249 // using a greater than or equal operation. :
1250 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1252 {
1253 // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
1254 // vreinterpretq_f32_m128(b)), 0);
1255 uint32x4_t a_not_nan =
1256 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1257 uint32x4_t b_not_nan =
1258 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1259 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1260 uint32x4_t a_ge_b =
1261 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1262 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) & 0x1;
1263 }
1264
1265 // Compares the lower single-precision floating point scalar values of a and b
1266 // using a greater than operation. :
1267 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1269 {
1270 // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
1271 // vreinterpretq_f32_m128(b)), 0);
1272 uint32x4_t a_not_nan =
1273 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1274 uint32x4_t b_not_nan =
1275 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1276 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1277 uint32x4_t a_gt_b =
1278 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1279 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) & 0x1;
1280 }
1281
1282 // Compares the lower single-precision floating point scalar values of a and b
1283 // using a less than or equal operation. :
1284 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1286 {
1287 // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
1288 // vreinterpretq_f32_m128(b)), 0);
1289 uint32x4_t a_not_nan =
1290 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1291 uint32x4_t b_not_nan =
1292 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1293 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1294 uint32x4_t a_le_b =
1295 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1296 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) & 0x1;
1297 }
1298
1299 // Compares the lower single-precision floating point scalar values of a and b
1300 // using a less than operation. :
// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
// Important note: the MSDN documentation is incorrect. If either of the values
// is a NaN, the docs say the result is one, but it will in fact return zero.
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1305 {
1306 uint32x4_t a_not_nan =
1307 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1308 uint32x4_t b_not_nan =
1309 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1310 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1311 uint32x4_t a_lt_b =
1312 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
1313 return vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) & 0x1;
1314 }
1315
1316 // Compares the lower single-precision floating point scalar values of a and b
1317 // using an inequality operation. :
1318 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1320 {
1321 // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
1322 // vreinterpretq_f32_m128(b)), 0);
1323 uint32x4_t a_not_nan =
1324 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1325 uint32x4_t b_not_nan =
1326 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1327 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1328 uint32x4_t a_neq_b = vmvnq_u32(
1329 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1330 return vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) & 0x1;
1331 }
1332
1333 // Convert packed signed 32-bit integers in b to packed single-precision
1334 // (32-bit) floating-point elements, store the results in the lower 2 elements
1335 // of dst, and copy the upper 2 packed elements from a to the upper elements of
1336 // dst.
1337 //
1338 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1339 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1340 // dst[95:64] := a[95:64]
1341 // dst[127:96] := a[127:96]
1342 //
1343 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1345 {
1346 return vreinterpretq_m128_f32(
1347 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1348 vget_high_f32(vreinterpretq_f32_m128(a))));
1349 }
1350
1351 // Convert packed single-precision (32-bit) floating-point elements in a to
1352 // packed 32-bit integers, and store the results in dst.
1353 //
1354 // FOR j := 0 to 1
1355 // i := 32*j
1356 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1357 // ENDFOR
1358 //
1359 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1361 {
1362 #if defined(__aarch64__)
1363 return vreinterpret_m64_s32(
1364 vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1365 #else
1366 return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1367 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
1368 #endif
1369 }
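
// Illustrative sketch (not part of the translation layer): under the default
// round-to-nearest-even mode, halfway values round to the even integer:
//
//   __m128 v = _mm_setr_ps(0.5f, 1.5f, 2.5f, 3.5f);
//   __m64 r = _mm_cvt_ps2pi(v);   // the two converted lanes are {0, 2}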
1370
1371 // Convert the signed 32-bit integer b to a single-precision (32-bit)
1372 // floating-point element, store the result in the lower element of dst, and
1373 // copy the upper 3 packed elements from a to the upper elements of dst.
1374 //
1375 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1376 // dst[127:32] := a[127:32]
1377 //
1378 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1380 {
1381 return vreinterpretq_m128_f32(
1382 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1383 }
1384
1385 // Convert the lower single-precision (32-bit) floating-point element in a to a
1386 // 32-bit integer, and store the result in dst.
1387 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
1389 {
1390 #if defined(__aarch64__)
1391 return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1392 0);
1393 #else
1394 float32_t data = vgetq_lane_f32(
1395 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1396 return (int32_t) data;
1397 #endif
1398 }
1399
1400 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
1401 // floating-point elements, and store the results in dst.
1402 //
1403 // FOR j := 0 to 3
1404 // i := j*16
1405 // m := j*32
1406 // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
1407 // ENDFOR
1408 //
1409 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1411 {
1412 return vreinterpretq_m128_f32(
1413 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1414 }
1415
1416 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
1417 // floating-point elements, store the results in the lower 2 elements of dst,
1418 // and copy the upper 2 packed elements from a to the upper elements of dst.
1419 //
1420 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1421 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
1422 // dst[95:64] := a[95:64]
1423 // dst[127:96] := a[127:96]
1424 //
1425 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1427 {
1428 return vreinterpretq_m128_f32(
1429 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1430 vget_high_f32(vreinterpretq_f32_m128(a))));
1431 }
1432
1433 // Convert packed signed 32-bit integers in a to packed single-precision
1434 // (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, then convert the packed signed 32-bit integers in b to
// single-precision (32-bit) floating-point elements, and store the results in
1437 // the upper 2 elements of dst.
1438 //
1439 // dst[31:0] := Convert_Int32_To_FP32(a[31:0])
1440 // dst[63:32] := Convert_Int32_To_FP32(a[63:32])
1441 // dst[95:64] := Convert_Int32_To_FP32(b[31:0])
1442 // dst[127:96] := Convert_Int32_To_FP32(b[63:32])
1443 //
1444 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1446 {
1447 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1448 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1449 }
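
// Illustrative sketch (not part of the translation layer), assuming
// _mm_set_pi32 is provided elsewhere in this header:
//
//   __m64 lo = _mm_set_pi32(2, 1);         // lanes {1, 2}
//   __m64 hi = _mm_set_pi32(4, 3);         // lanes {3, 4}
//   __m128 f = _mm_cvtpi32x2_ps(lo, hi);   // {1.0f, 2.0f, 3.0f, 4.0f}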
1450
1451 // Convert the lower packed 8-bit integers in a to packed single-precision
1452 // (32-bit) floating-point elements, and store the results in dst.
1453 //
1454 // FOR j := 0 to 3
1455 // i := j*8
1456 // m := j*32
1457 // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
1458 // ENDFOR
1459 //
1460 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1462 {
1463 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1464 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1465 }
1466
1467 // Convert packed single-precision (32-bit) floating-point elements in a to
1468 // packed 16-bit integers, and store the results in dst. Note: this intrinsic
1469 // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1470 // 0x7FFFFFFF.
1471 //
1472 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1474 {
1475 return vreinterpret_m64_s16(
1476 vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
1477 }
1478
1479 // Convert packed single-precision (32-bit) floating-point elements in a to
1480 // packed 32-bit integers, and store the results in dst.
1481 //
1482 // FOR j := 0 to 1
1483 // i := 32*j
1484 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
1485 // ENDFOR
1486 //
1487 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
1488 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1489
1490 // Convert packed unsigned 16-bit integers in a to packed single-precision
1491 // (32-bit) floating-point elements, and store the results in dst.
1492 //
1493 // FOR j := 0 to 3
1494 // i := j*16
1495 // m := j*32
1496 // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
1497 // ENDFOR
1498 //
1499 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1501 {
1502 return vreinterpretq_m128_f32(
1503 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1504 }
1505
1506 // Convert the lower packed unsigned 8-bit integers in a to packed
1507 // single-precision (32-bit) floating-point elements, and store the results in
1508 // dst.
1509 //
1510 // FOR j := 0 to 3
1511 // i := j*8
1512 // m := j*32
1513 // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
1514 // ENDFOR
1515 //
1516 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1518 {
1519 return vreinterpretq_m128_f32(vcvtq_f32_u32(
1520 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1521 }
1522
1523 // Convert the signed 32-bit integer b to a single-precision (32-bit)
1524 // floating-point element, store the result in the lower element of dst, and
1525 // copy the upper 3 packed elements from a to the upper elements of dst.
1526 //
1527 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
1528 // dst[127:32] := a[127:32]
1529 //
1530 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
1531 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1532
1533 // Convert the signed 64-bit integer b to a single-precision (32-bit)
1534 // floating-point element, store the result in the lower element of dst, and
1535 // copy the upper 3 packed elements from a to the upper elements of dst.
1536 //
1537 // dst[31:0] := Convert_Int64_To_FP32(b[63:0])
1538 // dst[127:32] := a[127:32]
1539 //
1540 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1542 {
1543 return vreinterpretq_m128_f32(
1544 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1545 }
1546
1547 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
1548 //
1549 // dst[31:0] := a[31:0]
1550 //
1551 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1553 {
1554 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1555 }
1556
1557 // Convert the lower single-precision (32-bit) floating-point element in a to a
1558 // 32-bit integer, and store the result in dst.
1559 //
1560 // dst[31:0] := Convert_FP32_To_Int32(a[31:0])
1561 //
1562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
1563 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1564
1565 // Convert the lower single-precision (32-bit) floating-point element in a to a
1566 // 64-bit integer, and store the result in dst.
1567 //
1568 // dst[63:0] := Convert_FP32_To_Int64(a[31:0])
1569 //
1570 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1572 {
1573 #if defined(__aarch64__)
1574 return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1575 #else
1576 float32_t data = vgetq_lane_f32(
1577 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1578 return (int64_t) data;
1579 #endif
1580 }
1581
1582 // Convert packed single-precision (32-bit) floating-point elements in a to
1583 // packed 32-bit integers with truncation, and store the results in dst.
1584 //
1585 // FOR j := 0 to 1
1586 // i := 32*j
1587 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1588 // ENDFOR
1589 //
1590 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1592 {
1593 return vreinterpret_m64_s32(
1594 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1595 }
1596
1597 // Convert the lower single-precision (32-bit) floating-point element in a to a
1598 // 32-bit integer with truncation, and store the result in dst.
1599 //
1600 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1601 //
1602 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1604 {
1605 return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1606 }
1607
1608 // Convert packed single-precision (32-bit) floating-point elements in a to
1609 // packed 32-bit integers with truncation, and store the results in dst.
1610 //
1611 // FOR j := 0 to 1
1612 // i := 32*j
1613 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
1614 // ENDFOR
1615 //
1616 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
1617 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1618
1619 // Convert the lower single-precision (32-bit) floating-point element in a to a
1620 // 32-bit integer with truncation, and store the result in dst.
1621 //
1622 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
1623 //
1624 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
1625 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1626
1627 // Convert the lower single-precision (32-bit) floating-point element in a to a
1628 // 64-bit integer with truncation, and store the result in dst.
1629 //
1630 // dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
1631 //
1632 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1634 {
1635 return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1636 }
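
// Illustrative sketch (not part of the translation layer): the truncating
// conversions round toward zero, while the plain conversions honour the
// current rounding mode:
//
//   __m128 v = _mm_set_ss(1.7f);
//   int t = _mm_cvtt_ss2si(v);   // 1 (truncated)
//   int r = _mm_cvt_ss2si(v);    // 2 under the default round-to-nearest mode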
1637
1638 // Divides the four single-precision, floating-point values of a and b.
1639 //
1640 // r0 := a0 / b0
1641 // r1 := a1 / b1
1642 // r2 := a2 / b2
1643 // r3 := a3 / b3
1644 //
1645 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1647 {
1648 #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
1649 return vreinterpretq_m128_f32(
1650 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1651 #else
1652 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1653 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1654 #if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
1656 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1657 #endif
1658 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1659 #endif
1660 }
1661
1662 // Divides the scalar single-precision floating point value of a by b.
1663 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1665 {
1666 float32_t value =
1667 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1668 return vreinterpretq_m128_f32(
1669 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1670 }
1671
1672 // Extract a 16-bit integer from a, selected with imm8, and store the result in
1673 // the lower element of dst.
1674 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16
1675 #define _mm_extract_pi16(a, imm) \
1676 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1677
1678 // Free aligned memory that was allocated with _mm_malloc.
1679 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free
FORCE_INLINE void _mm_free(void *addr)
1681 {
1682 free(addr);
1683 }
1684
1685 // Macro: Get the rounding mode bits from the MXCSR control and status register.
1686 // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1687 // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1688 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
1690 {
1691 union {
1692 fpcr_bitfield field;
1693 #if defined(__aarch64__)
1694 uint64_t value;
1695 #else
1696 uint32_t value;
1697 #endif
1698 } r;
1699
1700 #if defined(__aarch64__)
1701 asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
1702 #else
1703 asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1704 #endif
1705
1706 if (r.field.bit22) {
1707 return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1708 } else {
1709 return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
1710 }
1711 }
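
// Illustrative sketch (not part of the translation layer): the returned value
// can be compared directly against the _MM_ROUND_* constants used above:
//
//   unsigned int mode = _MM_GET_ROUNDING_MODE();
//   if (mode == _MM_ROUND_NEAREST) {
//       /* FPCR/FPSCR is in its default round-to-nearest state */
//   }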
1712
1713 // Copy a to dst, and insert the 16-bit integer i into dst at the location
1714 // specified by imm8.
1715 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
1716 #define _mm_insert_pi16(a, b, imm) \
1717 __extension__({ \
1718 vreinterpret_m64_s16( \
1719 vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
1720 })
1721
1722 // Loads four single-precision, floating-point values.
1723 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
1725 {
1726 return vreinterpretq_m128_f32(vld1q_f32(p));
1727 }
1728
1729 // Load a single-precision (32-bit) floating-point element from memory into all
1730 // elements of dst.
1731 //
1732 // dst[31:0] := MEM[mem_addr+31:mem_addr]
1733 // dst[63:32] := MEM[mem_addr+31:mem_addr]
1734 // dst[95:64] := MEM[mem_addr+31:mem_addr]
1735 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1736 //
1737 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
1738 #define _mm_load_ps1 _mm_load1_ps
1739
// Loads a single-precision, floating-point value into the low word and
// clears the upper three words.
1742 // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
1744 {
1745 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1746 }
1747
// Loads a single-precision, floating-point value and copies it into all
// four words.
1750 // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1752 {
1753 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1754 }
1755
1756 // Sets the upper two single-precision, floating-point values with 64
1757 // bits of data loaded from the address p; the lower two values are passed
1758 // through from a.
1759 //
1760 // r0 := a0
1761 // r1 := a1
1762 // r2 := *p0
1763 // r3 := *p1
1764 //
1765 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1767 {
1768 return vreinterpretq_m128_f32(
1769 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1770 }
1771
1772 // Sets the lower two single-precision, floating-point values with 64
1773 // bits of data loaded from the address p; the upper two values are passed
1774 // through from a.
1775 //
1776 // Return Value
1777 // r0 := *p0
1778 // r1 := *p1
1779 // r2 := a2
1780 // r3 := a3
1781 //
1782 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1784 {
1785 return vreinterpretq_m128_f32(
1786 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1787 }
1788
1789 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
1790 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1791 // general-protection exception may be generated.
1792 //
1793 // dst[31:0] := MEM[mem_addr+127:mem_addr+96]
1794 // dst[63:32] := MEM[mem_addr+95:mem_addr+64]
1795 // dst[95:64] := MEM[mem_addr+63:mem_addr+32]
1796 // dst[127:96] := MEM[mem_addr+31:mem_addr]
1797 //
1798 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1800 {
1801 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1802 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1803 }
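
// Illustrative sketch (not part of the translation layer): element 0 of the
// result comes from the highest-addressed float:
//
//   float ALIGN_STRUCT(16) buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//   __m128 v = _mm_loadr_ps(buf);   // lanes {4.0f, 3.0f, 2.0f, 1.0f}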
1804
1805 // Loads four single-precision, floating-point values.
1806 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1808 {
    // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
    // equivalent.
1811 return vreinterpretq_m128_f32(vld1q_f32(p));
1812 }
1813
1814 // Load unaligned 16-bit integer from memory into the first element of dst.
1815 //
1816 // dst[15:0] := MEM[mem_addr+15:mem_addr]
1817 // dst[MAX:16] := 0
1818 //
1819 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1821 {
1822 return vreinterpretq_m128i_s16(
1823 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1824 }
1825
1826 // Load unaligned 64-bit integer from memory into the first element of dst.
1827 //
1828 // dst[63:0] := MEM[mem_addr+63:mem_addr]
1829 // dst[MAX:64] := 0
1830 //
1831 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1833 {
1834 return vreinterpretq_m128i_s64(
1835 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1836 }
1837
1838 // Allocate aligned blocks of memory.
1839 // https://software.intel.com/en-us/
1840 // cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1842 {
1843 void *ptr;
1844 if (align == 1)
1845 return malloc(size);
1846 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1847 align = sizeof(void *);
1848 if (!posix_memalign(&ptr, align, size))
1849 return ptr;
1850 return NULL;
1851 }
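
// Illustrative sketch (not part of the translation layer): _mm_malloc pairs
// with _mm_free above, e.g. for a 16-byte aligned scratch buffer:
//
//   float *buf = (float *) _mm_malloc(4 * sizeof(float), 16);
//   if (buf) {
//       _mm_store_ps(buf, _mm_setzero_ps());
//       _mm_free(buf);
//   }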
1852
1853 // Conditionally store 8-bit integer elements from a into memory using mask
1854 // (elements are not stored when the highest bit is not set in the corresponding
1855 // element) and a non-temporal memory hint.
1856 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64
FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
1858 {
1859 int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
1860 __m128 b = _mm_load_ps((const float *) mem_addr);
1861 int8x8_t masked =
1862 vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
1863 vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
1864 vst1_s8((int8_t *) mem_addr, masked);
1865 }
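
// Illustrative sketch (not part of the translation layer): only bytes whose
// mask element has its top bit set are written back. Assuming _mm_set_pi8 is
// provided elsewhere in this header:
//
//   char buf[16] = {0};   // oversized on purpose: the helper above loads a
//                         // full 16 bytes around mem_addr before masking
//   __m64 data = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
//   __m64 mask = _mm_set_pi8(0, -1, 0, -1, 0, -1, 0, -1);
//   _mm_maskmove_si64(data, mask, buf);   // buf[0..7] = {1, 0, 3, 0, 5, 0, 7, 0}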
1866
1867 // Conditionally store 8-bit integer elements from a into memory using mask
1868 // (elements are not stored when the highest bit is not set in the corresponding
1869 // element) and a non-temporal memory hint.
1870 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq
1871 #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1872
1873 // Compare packed signed 16-bit integers in a and b, and store packed maximum
1874 // values in dst.
1875 //
1876 // FOR j := 0 to 3
1877 // i := j*16
1878 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
1879 // ENDFOR
1880 //
1881 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
1883 {
1884 return vreinterpret_m64_s16(
1885 vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
1886 }
1887
1888 // Computes the maximums of the four single-precision, floating-point values of
1889 // a and b.
1890 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
1892 {
1893 #if SSE2NEON_PRECISE_MINMAX
1894 float32x4_t _a = vreinterpretq_f32_m128(a);
1895 float32x4_t _b = vreinterpretq_f32_m128(b);
1896 return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
1897 #else
1898 return vreinterpretq_m128_f32(
1899 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1900 #endif
1901 }
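
// Illustrative note (not part of the translation layer): with
// SSE2NEON_PRECISE_MINMAX enabled, NaN handling matches SSE, which returns the
// second operand when either input is NaN. Assuming <math.h> provides NAN:
//
//   __m128 a = _mm_set1_ps(NAN);
//   __m128 b = _mm_set1_ps(1.0f);
//   __m128 m = _mm_max_ps(a, b);   // every lane is 1.0f in precise mode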
1902
1903 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
1904 // values in dst.
1905 //
1906 // FOR j := 0 to 7
1907 // i := j*8
1908 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
1909 // ENDFOR
1910 //
1911 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
1913 {
1914 return vreinterpret_m64_u8(
1915 vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1916 }
1917
1918 // Computes the maximum of the two lower scalar single-precision floating point
1919 // values of a and b.
1920 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
1922 {
1923 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
1924 return vreinterpretq_m128_f32(
1925 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1926 }
1927
1928 // Compare packed signed 16-bit integers in a and b, and store packed minimum
1929 // values in dst.
1930 //
1931 // FOR j := 0 to 3
1932 // i := j*16
1933 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
1934 // ENDFOR
1935 //
1936 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
1938 {
1939 return vreinterpret_m64_s16(
1940 vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
1941 }
1942
1943 // Computes the minima of the four single-precision, floating-point values of a
1944 // and b.
1945 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
1947 {
1948 #if SSE2NEON_PRECISE_MINMAX
1949 float32x4_t _a = vreinterpretq_f32_m128(a);
1950 float32x4_t _b = vreinterpretq_f32_m128(b);
1951 return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
1952 #else
1953 return vreinterpretq_m128_f32(
1954 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1955 #endif
1956 }
1957
1958 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
1959 // values in dst.
1960 //
1961 // FOR j := 0 to 7
1962 // i := j*8
1963 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
1964 // ENDFOR
1965 //
1966 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
1968 {
1969 return vreinterpret_m64_u8(
1970 vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1971 }
1972
1973 // Computes the minimum of the two lower scalar single-precision floating point
1974 // values of a and b.
1975 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
1977 {
1978 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
1979 return vreinterpretq_m128_f32(
1980 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1981 }
1982
1983 // Sets the low word to the single-precision, floating-point value of b
1984 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
1986 {
1987 return vreinterpretq_m128_f32(
1988 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
1989 vreinterpretq_f32_m128(a), 0));
1990 }
1991
1992 // Moves the upper two values of B into the lower two values of A.
1993 //
1994 // r3 := a3
1995 // r2 := a2
1996 // r1 := b3
1997 // r0 := b2
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1999 {
2000 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
2001 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
2002 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2003 }
2004
2005 // Moves the lower two values of B into the upper two values of A.
2006 //
2007 // r3 := b1
2008 // r2 := b0
2009 // r1 := a1
2010 // r0 := a0
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2012 {
2013 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2014 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2015 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2016 }
2017
2018 // Create mask from the most significant bit of each 8-bit element in a, and
2019 // store the result in dst.
2020 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2022 {
2023 uint8x8_t input = vreinterpret_u8_m64(a);
2024 #if defined(__aarch64__)
2025 static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7};
2026 uint8x8_t tmp = vshr_n_u8(input, 7);
2027 return vaddv_u8(vshl_u8(tmp, shift));
2028 #else
    // Refer to the implementation of `_mm_movemask_epi8`
2030 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2031 uint32x2_t paired16 =
2032 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2033 uint8x8_t paired32 =
2034 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2035 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2036 #endif
2037 }
2038
2039 // NEON does not provide this method
2040 // Creates a 4-bit mask from the most significant bits of the four
2041 // single-precision, floating-point values.
2042 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
FORCE_INLINE int _mm_movemask_ps(__m128 a)
2044 {
2045 uint32x4_t input = vreinterpretq_u32_m128(a);
2046 #if defined(__aarch64__)
2047 static const int32x4_t shift = {0, 1, 2, 3};
2048 uint32x4_t tmp = vshrq_n_u32(input, 31);
2049 return vaddvq_u32(vshlq_u32(tmp, shift));
2050 #else
2051 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2052 // Shift out everything but the sign bits with a 32-bit unsigned shift
2053 // right.
2054 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2055 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2056 uint8x16_t paired =
2057 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2058 // Extract the result.
2059 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2060 #endif
2061 }
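
// Illustrative sketch (not part of the translation layer): each result bit
// mirrors one lane's sign bit, which is handy for branching on comparisons:
//
//   __m128 v = _mm_setr_ps(-1.0f, 2.0f, -3.0f, 4.0f);
//   int m = _mm_movemask_ps(v);   // 0x5: lanes 0 and 2 are negative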
2062
2063 // Multiplies the four single-precision, floating-point values of a and b.
2064 //
2065 // r0 := a0 * b0
2066 // r1 := a1 * b1
2067 // r2 := a2 * b2
2068 // r3 := a3 * b3
2069 //
2070 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2072 {
2073 return vreinterpretq_m128_f32(
2074 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2075 }
2076
2077 // Multiply the lower single-precision (32-bit) floating-point element in a and
2078 // b, store the result in the lower element of dst, and copy the upper 3 packed
2079 // elements from a to the upper elements of dst.
2080 //
2081 // dst[31:0] := a[31:0] * b[31:0]
2082 // dst[127:32] := a[127:32]
2083 //
2084 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2086 {
2087 return _mm_move_ss(a, _mm_mul_ps(a, b));
2088 }
2089
2090 // Multiply the packed unsigned 16-bit integers in a and b, producing
2091 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
2092 // integers in dst.
2093 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2095 {
2096 return vreinterpret_m64_u16(vshrn_n_u32(
2097 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2098 }
2099
2100 // Computes the bitwise OR of the four single-precision, floating-point values
2101 // of a and b.
2102 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2104 {
2105 return vreinterpretq_m128_s32(
2106 vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2107 }
2108
2109 // Average packed unsigned 8-bit integers in a and b, and store the results in
2110 // dst.
2111 //
2112 // FOR j := 0 to 7
2113 // i := j*8
2114 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2115 // ENDFOR
2116 //
2117 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2118 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2119
2120 // Average packed unsigned 16-bit integers in a and b, and store the results in
2121 // dst.
2122 //
2123 // FOR j := 0 to 3
2124 // i := j*16
2125 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2126 // ENDFOR
2127 //
2128 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2129 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2130
2131 // Extract a 16-bit integer from a, selected with imm8, and store the result in
2132 // the lower element of dst.
2133 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
2134 #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2135
2136 // Copy a to dst, and insert the 16-bit integer i into dst at the location
2137 // specified by imm8.
2138 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
2139 #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2140
2141 // Compare packed signed 16-bit integers in a and b, and store packed maximum
2142 // values in dst.
2143 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
2144 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2145
2146 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2147 // values in dst.
2148 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
2149 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2150
2151 // Compare packed signed 16-bit integers in a and b, and store packed minimum
2152 // values in dst.
2153 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw
2154 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
2155
2156 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2157 // values in dst.
2158 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub
2159 #define _m_pminub(a, b) _mm_min_pu8(a, b)
2160
2161 // Create mask from the most significant bit of each 8-bit element in a, and
2162 // store the result in dst.
2163 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb
2164 #define _m_pmovmskb(a) _mm_movemask_pi8(a)
2165
2166 // Multiply the packed unsigned 16-bit integers in a and b, producing
2167 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
2168 // integers in dst.
2169 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
2170 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2171
2172 // Loads one cache line of data from address p to a location closer to the
2173 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
FORCE_INLINE void _mm_prefetch(const void *p, int i)
2175 {
2176 (void) i;
2177 __builtin_prefetch(p);
2178 }
2179
2180 // Compute the absolute differences of packed unsigned 8-bit integers in a and
2181 // b, then horizontally sum each consecutive 8 differences to produce four
2182 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2183 // 16 bits of dst.
2184 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw
2185 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2186
2187 // Shuffle 16-bit integers in a using the control in imm8, and store the results
2188 // in dst.
2189 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pshufw
2190 #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2191
2192 // Compute the approximate reciprocal of packed single-precision (32-bit)
2193 // floating-point elements in a, and store the results in dst. The maximum
2194 // relative error for this approximation is less than 1.5*2^-12.
2195 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2197 {
2198 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2199 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2200 #if SSE2NEON_PRECISE_DIV
    // Additional Newton-Raphson iteration for accuracy
2202 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2203 #endif
2204 return vreinterpretq_m128_f32(recip);
2205 }
2206
2207 // Compute the approximate reciprocal of the lower single-precision (32-bit)
2208 // floating-point element in a, store the result in the lower element of dst,
2209 // and copy the upper 3 packed elements from a to the upper elements of dst. The
2210 // maximum relative error for this approximation is less than 1.5*2^-12.
2211 //
2212 // dst[31:0] := (1.0 / a[31:0])
2213 // dst[127:32] := a[127:32]
2214 //
2215 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2217 {
2218 return _mm_move_ss(a, _mm_rcp_ps(a));
2219 }
2220
2221 // Computes the approximations of the reciprocal square roots of the four
2222 // single-precision floating point values of in.
2223 // The current precision is 1% error.
2224 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2226 {
2227 float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2228 #if SSE2NEON_PRECISE_SQRT
    // Additional Newton-Raphson iteration for accuracy
2230 out = vmulq_f32(
2231 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2232 out = vmulq_f32(
2233 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2234 #endif
2235 return vreinterpretq_m128_f32(out);
2236 }
2237
2238 // Compute the approximate reciprocal square root of the lower single-precision
2239 // (32-bit) floating-point element in a, store the result in the lower element
2240 // of dst, and copy the upper 3 packed elements from a to the upper elements of
2241 // dst.
2242 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2244 {
2245 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2246 }
2247
2248 // Compute the absolute differences of packed unsigned 8-bit integers in a and
2249 // b, then horizontally sum each consecutive 8 differences to produce four
2250 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2251 // 16 bits of dst.
2252 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2254 {
2255 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2256 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2257 return vreinterpret_m64_u16(
2258 vset_lane_u16(vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2259 }
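
// Illustrative sketch (not part of the translation layer): the sum of absolute
// byte differences lands in the lowest 16-bit lane. Assuming _mm_set1_pi8 is
// provided elsewhere in this header:
//
//   __m64 a = _mm_set1_pi8(10);
//   __m64 b = _mm_set1_pi8(7);
//   __m64 s = _mm_sad_pu8(a, b);   // lowest 16 bits hold 8 * |10 - 7| = 24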
2260
2261 // Sets the four single-precision, floating-point values to the four inputs.
2262 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2264 {
2265 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2266 return vreinterpretq_m128_f32(vld1q_f32(data));
2267 }
2268
2269 // Sets the four single-precision, floating-point values to w.
2270 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
2272 {
2273 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2274 }
2275
2276 // Macro: Set the rounding mode bits of the MXCSR control and status register to
2277 // the value in unsigned 32-bit integer a. The rounding mode may contain any of
2278 // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2279 // _MM_ROUND_TOWARD_ZERO
2280 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2282 {
2283 union {
2284 fpcr_bitfield field;
2285 #if defined(__aarch64__)
2286 uint64_t value;
2287 #else
2288 uint32_t value;
2289 #endif
2290 } r;
2291
2292 #if defined(__aarch64__)
2293 asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */
2294 #else
2295 asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2296 #endif
2297
2298 switch (rounding) {
2299 case _MM_ROUND_TOWARD_ZERO:
2300 r.field.bit22 = 1;
2301 r.field.bit23 = 1;
2302 break;
2303 case _MM_ROUND_DOWN:
2304 r.field.bit22 = 0;
2305 r.field.bit23 = 1;
2306 break;
2307 case _MM_ROUND_UP:
2308 r.field.bit22 = 1;
2309 r.field.bit23 = 0;
2310 break;
2311 default: //_MM_ROUND_NEAREST
2312 r.field.bit22 = 0;
2313 r.field.bit23 = 0;
2314 }
2315
2316 #if defined(__aarch64__)
2317 asm volatile("msr FPCR, %0" ::"r"(r)); /* write */
2318 #else
2319 asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */
2320 #endif
2321 }
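
// Illustrative sketch (not part of the translation layer): the selected mode
// is visible to the non-truncating conversions above:
//
//   _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
//   int r = _mm_cvt_ss2si(_mm_set_ss(1.7f));   // 1 when rounding toward -inf
//   _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  // restore the default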
2322
2323 // Copy single-precision (32-bit) floating-point element a to the lower element
2324 // of dst, and zero the upper 3 elements.
2325 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
FORCE_INLINE __m128 _mm_set_ss(float a)
2327 {
2328 float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
2329 return vreinterpretq_m128_f32(vld1q_f32(data));
2330 }
2331
2332 // Sets the four single-precision, floating-point values to w.
2333 //
2334 // r0 := r1 := r2 := r3 := w
2335 //
2336 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set1_ps(float _w)
2338 {
2339 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2340 }
2341
FORCE_INLINE void _mm_setcsr(unsigned int a)
2343 {
2344 _MM_SET_ROUNDING_MODE(a);
2345 }
2346
2347 // Sets the four single-precision, floating-point values to the four inputs in
2348 // reverse order.
2349 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2351 {
2352 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2353 return vreinterpretq_m128_f32(vld1q_f32(data));
2354 }
2355
2356 // Clears the four single-precision, floating-point values.
2357 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setzero_ps(void)
2359 {
2360 return vreinterpretq_m128_f32(vdupq_n_f32(0));
2361 }
2362
2363 // Shuffle 16-bit integers in a using the control in imm8, and store the results
2364 // in dst.
2365 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pi16
2366 #if __has_builtin(__builtin_shufflevector)
2367 #define _mm_shuffle_pi16(a, imm) \
2368 __extension__({ \
2369 vreinterpret_m64_s16(__builtin_shufflevector( \
2370 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2371 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))); \
2372 })
2373 #else
2374 #define _mm_shuffle_pi16(a, imm) \
2375 __extension__({ \
2376 int16x4_t ret; \
2377 ret = \
2378 vmov_n_s16(vget_lane_s16(vreinterpret_s16_m64(a), (imm) & (0x3))); \
2379 ret = vset_lane_s16( \
2380 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 2) & 0x3), ret, \
2381 1); \
2382 ret = vset_lane_s16( \
2383 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 4) & 0x3), ret, \
2384 2); \
2385 ret = vset_lane_s16( \
2386 vget_lane_s16(vreinterpret_s16_m64(a), ((imm) >> 6) & 0x3), ret, \
2387 3); \
2388 vreinterpret_m64_s16(ret); \
2389 })
2390 #endif
2391
2392 // Guarantees that every preceding store is globally visible before any
2393 // subsequent store.
2394 // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
FORCE_INLINE void _mm_sfence(void)
2396 {
2397 __sync_synchronize();
2398 }
2399
2400 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
2401 // int imm)
2402 #if __has_builtin(__builtin_shufflevector)
2403 #define _mm_shuffle_ps(a, b, imm) \
2404 __extension__({ \
2405 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2406 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2407 float32x4_t _shuf = __builtin_shufflevector( \
2408 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2409 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2410 vreinterpretq_m128_f32(_shuf); \
2411 })
2412 #else // generic
2413 #define _mm_shuffle_ps(a, b, imm) \
2414 __extension__({ \
2415 __m128 ret; \
2416 switch (imm) { \
2417 case _MM_SHUFFLE(1, 0, 3, 2): \
2418 ret = _mm_shuffle_ps_1032((a), (b)); \
2419 break; \
2420 case _MM_SHUFFLE(2, 3, 0, 1): \
2421 ret = _mm_shuffle_ps_2301((a), (b)); \
2422 break; \
2423 case _MM_SHUFFLE(0, 3, 2, 1): \
2424 ret = _mm_shuffle_ps_0321((a), (b)); \
2425 break; \
2426 case _MM_SHUFFLE(2, 1, 0, 3): \
2427 ret = _mm_shuffle_ps_2103((a), (b)); \
2428 break; \
2429 case _MM_SHUFFLE(1, 0, 1, 0): \
2430 ret = _mm_movelh_ps((a), (b)); \
2431 break; \
2432 case _MM_SHUFFLE(1, 0, 0, 1): \
2433 ret = _mm_shuffle_ps_1001((a), (b)); \
2434 break; \
2435 case _MM_SHUFFLE(0, 1, 0, 1): \
2436 ret = _mm_shuffle_ps_0101((a), (b)); \
2437 break; \
2438 case _MM_SHUFFLE(3, 2, 1, 0): \
2439 ret = _mm_shuffle_ps_3210((a), (b)); \
2440 break; \
2441 case _MM_SHUFFLE(0, 0, 1, 1): \
2442 ret = _mm_shuffle_ps_0011((a), (b)); \
2443 break; \
2444 case _MM_SHUFFLE(0, 0, 2, 2): \
2445 ret = _mm_shuffle_ps_0022((a), (b)); \
2446 break; \
2447 case _MM_SHUFFLE(2, 2, 0, 0): \
2448 ret = _mm_shuffle_ps_2200((a), (b)); \
2449 break; \
2450 case _MM_SHUFFLE(3, 2, 0, 2): \
2451 ret = _mm_shuffle_ps_3202((a), (b)); \
2452 break; \
2453 case _MM_SHUFFLE(3, 2, 3, 2): \
2454 ret = _mm_movehl_ps((b), (a)); \
2455 break; \
2456 case _MM_SHUFFLE(1, 1, 3, 3): \
2457 ret = _mm_shuffle_ps_1133((a), (b)); \
2458 break; \
2459 case _MM_SHUFFLE(2, 0, 1, 0): \
2460 ret = _mm_shuffle_ps_2010((a), (b)); \
2461 break; \
2462 case _MM_SHUFFLE(2, 0, 0, 1): \
2463 ret = _mm_shuffle_ps_2001((a), (b)); \
2464 break; \
2465 case _MM_SHUFFLE(2, 0, 3, 2): \
2466 ret = _mm_shuffle_ps_2032((a), (b)); \
2467 break; \
2468 default: \
2469 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
2470 break; \
2471 } \
2472 ret; \
2473 })
2474 #endif
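
// Illustrative sketch (not part of the translation layer): the selector is
// encoded with _MM_SHUFFLE(z, y, x, w); the two low fields (w, x) pick the low
// result lanes from a and the two high fields (y, z) pick the high result
// lanes from b:
//
//   __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
//   __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//   // r = {0.0f, 1.0f, 6.0f, 7.0f}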
2475
2476 // Computes the approximations of square roots of the four single-precision,
2477 // floating-point values of a. First computes reciprocal square roots and then
2478 // reciprocals of the four values.
2479 //
2480 // r0 := sqrt(a0)
2481 // r1 := sqrt(a1)
2482 // r2 := sqrt(a2)
2483 // r3 := sqrt(a3)
2484 //
2485 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2487 {
2488 #if SSE2NEON_PRECISE_SQRT
2489 float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2490
2491 // Test for vrsqrteq_f32(0) -> positive infinity case.
2492 // Change to zero, so that s * 1/sqrt(s) result is zero too.
2493 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2494 const uint32x4_t div_by_zero =
2495 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2496 recip = vreinterpretq_f32_u32(
2497 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2498
    // Additional Newton-Raphson iteration for accuracy
2500 recip = vmulq_f32(
2501 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2502 recip);
2503 recip = vmulq_f32(
2504 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2505 recip);
2506
2507 // sqrt(s) = s * 1/sqrt(s)
2508 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2509 #elif defined(__aarch64__)
2510 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2511 #else
2512 float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2513 float32x4_t sq = vrecpeq_f32(recipsq);
2514 return vreinterpretq_m128_f32(sq);
2515 #endif
2516 }
2517
2518 // Computes the approximation of the square root of the scalar single-precision
2519 // floating point value of in.
2520 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2522 {
2523 float32_t value =
2524 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2525 return vreinterpretq_m128_f32(
2526 vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2527 }
2528
2529 // Stores four single-precision, floating-point values.
2530 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2532 {
2533 vst1q_f32(p, vreinterpretq_f32_m128(a));
2534 }
2535
2536 // Store the lower single-precision (32-bit) floating-point element from a into
2537 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2538 // boundary or a general-protection exception may be generated.
2539 //
2540 // MEM[mem_addr+31:mem_addr] := a[31:0]
2541 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
2542 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
2543 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2544 //
2545 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2547 {
2548 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2549 vst1q_f32(p, vdupq_n_f32(a0));
2550 }
2551
// Stores the lower single-precision, floating-point value.
2553 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2555 {
2556 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2557 }
2558
2559 // Store the lower single-precision (32-bit) floating-point element from a into
2560 // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2561 // boundary or a general-protection exception may be generated.
2562 //
2563 // MEM[mem_addr+31:mem_addr] := a[31:0]
2564 // MEM[mem_addr+63:mem_addr+32] := a[31:0]
2565 // MEM[mem_addr+95:mem_addr+64] := a[31:0]
2566 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2567 //
2568 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
2569 #define _mm_store1_ps _mm_store_ps1
2570
2571 // Stores the upper two single-precision, floating-point values of a to the
2572 // address p.
2573 //
2574 // *p0 := a2
2575 // *p1 := a3
2576 //
2577 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2579 {
2580 *p = vreinterpret_m64_f32(vget_high_f32(a));
2581 }
2582
2583 // Stores the lower two single-precision floating point values of a to the
2584 // address p.
2585 //
2586 // *p0 := a0
2587 // *p1 := a1
2588 //
2589 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2591 {
2592 *p = vreinterpret_m64_f32(vget_low_f32(a));
2593 }
2594
2595 // Store 4 single-precision (32-bit) floating-point elements from a into memory
2596 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2597 // general-protection exception may be generated.
2598 //
2599 // MEM[mem_addr+31:mem_addr] := a[127:96]
2600 // MEM[mem_addr+63:mem_addr+32] := a[95:64]
2601 // MEM[mem_addr+95:mem_addr+64] := a[63:32]
2602 // MEM[mem_addr+127:mem_addr+96] := a[31:0]
2603 //
2604 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2606 {
2607 float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2608 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2609 vst1q_f32(p, rev);
2610 }
2611
2612 // Stores four single-precision, floating-point values.
2613 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2615 {
2616 vst1q_f32(p, vreinterpretq_f32_m128(a));
2617 }
2618
2619 // Stores 16-bits of integer data a at the address p.
2620 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2622 {
2623 vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2624 }
2625
2626 // Stores 64-bits of integer data a at the address p.
2627 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2629 {
2630 vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2631 }
2632
2633 // Store 64-bits of integer data from a into memory using a non-temporal memory
2634 // hint.
2635 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2637 {
2638 vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2639 }
2640
2641 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2642 // point elements) from a into memory using a non-temporal memory hint.
2643 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2645 {
2646 #if __has_builtin(__builtin_nontemporal_store)
2647 __builtin_nontemporal_store(a, (float32x4_t *) p);
2648 #else
2649 vst1q_f32(p, vreinterpretq_f32_m128(a));
2650 #endif
2651 }
2652
2653 // Subtracts the four single-precision, floating-point values of a and b.
2654 //
2655 // r0 := a0 - b0
2656 // r1 := a1 - b1
2657 // r2 := a2 - b2
2658 // r3 := a3 - b3
2659 //
2660 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2662 {
2663 return vreinterpretq_m128_f32(
2664 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2665 }
2666
2667 // Subtract the lower single-precision (32-bit) floating-point element in b from
2668 // the lower single-precision (32-bit) floating-point element in a, store the
2669 // result in the lower element of dst, and copy the upper 3 packed elements from
2670 // a to the upper elements of dst.
2671 //
2672 // dst[31:0] := a[31:0] - b[31:0]
2673 // dst[127:32] := a[127:32]
2674 //
2675 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2677 {
2678 return _mm_move_ss(a, _mm_sub_ps(a, b));
2679 }
2680
2681 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2682 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2683 // transposed matrix in these vectors (row0 now contains column 0, etc.).
2684 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
2685 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2686 do { \
2687 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2688 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2689 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2690 vget_low_f32(ROW23.val[0])); \
2691 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2692 vget_low_f32(ROW23.val[1])); \
2693 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2694 vget_high_f32(ROW23.val[0])); \
2695 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2696 vget_high_f32(ROW23.val[1])); \
2697 } while (0)
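
// Illustrative sketch (not part of the translation layer): transposing a 4x4
// matrix held in four row vectors, in place:
//
//   __m128 r0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 r1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   __m128 r2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
//   __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);   // r0 = {1, 5, 9, 13}, and so on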
2698
2699 // according to the documentation, these intrinsics behave the same as the
2700 // non-'u' versions. We'll just alias them here.
2701 #define _mm_ucomieq_ss _mm_comieq_ss
2702 #define _mm_ucomige_ss _mm_comige_ss
2703 #define _mm_ucomigt_ss _mm_comigt_ss
2704 #define _mm_ucomile_ss _mm_comile_ss
2705 #define _mm_ucomilt_ss _mm_comilt_ss
2706 #define _mm_ucomineq_ss _mm_comineq_ss
2707
2708 // Return vector of type __m128i with undefined elements.
2709 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_undefined_si128
FORCE_INLINE __m128i _mm_undefined_si128(void)
2711 {
2712 #if defined(__GNUC__) || defined(__clang__)
2713 #pragma GCC diagnostic push
2714 #pragma GCC diagnostic ignored "-Wuninitialized"
2715 #endif
2716 __m128i a;
2717 return a;
2718 #if defined(__GNUC__) || defined(__clang__)
2719 #pragma GCC diagnostic pop
2720 #endif
2721 }
2722
2723 // Return vector of type __m128 with undefined elements.
2724 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
2726 {
2727 #if defined(__GNUC__) || defined(__clang__)
2728 #pragma GCC diagnostic push
2729 #pragma GCC diagnostic ignored "-Wuninitialized"
2730 #endif
2731 __m128 a;
2732 return a;
2733 #if defined(__GNUC__) || defined(__clang__)
2734 #pragma GCC diagnostic pop
2735 #endif
2736 }
2737
2738 // Selects and interleaves the upper two single-precision, floating-point values
2739 // from a and b.
2740 //
2741 // r0 := a2
2742 // r1 := b2
2743 // r2 := a3
2744 // r3 := b3
2745 //
2746 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2748 {
2749 #if defined(__aarch64__)
2750 return vreinterpretq_m128_f32(
2751 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2752 #else
2753 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2754 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2755 float32x2x2_t result = vzip_f32(a1, b1);
2756 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2757 #endif
2758 }
2759
2760 // Selects and interleaves the lower two single-precision, floating-point values
2761 // from a and b.
2762 //
2763 // r0 := a0
2764 // r1 := b0
2765 // r2 := a1
2766 // r3 := b1
2767 //
2768 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2770 {
2771 #if defined(__aarch64__)
2772 return vreinterpretq_m128_f32(
2773 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2774 #else
2775 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2776 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2777 float32x2x2_t result = vzip_f32(a1, b1);
2778 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2779 #endif
2780 }
2781
2782 // Computes bitwise EXOR (exclusive-or) of the four single-precision,
2783 // floating-point values of a and b.
2784 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2786 {
2787 return vreinterpretq_m128_s32(
2788 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2789 }
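
// A common use of _mm_xor_ps is flipping the sign bit of every lane, e.g.
// (illustrative):
//
//   __m128 negated = _mm_xor_ps(v, _mm_set1_ps(-0.0f)); // -0.0f sets only the sign bit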
2790
2791 /* SSE2 */
2792
2793 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2794 // unsigned 16-bit integers in b.
2795 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2797 {
2798 return vreinterpretq_m128i_s16(
2799 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2800 }
2801
2802 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2803 // unsigned 32-bit integers in b.
2804 //
2805 // r0 := a0 + b0
2806 // r1 := a1 + b1
2807 // r2 := a2 + b2
2808 // r3 := a3 + b3
2809 //
2810 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2812 {
2813 return vreinterpretq_m128i_s32(
2814 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2815 }
2816
// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
// unsigned 64-bit integers in b.
2819 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2821 {
2822 return vreinterpretq_m128i_s64(
2823 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2824 }
2825
2826 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2827 // unsigned 8-bit integers in b.
2828 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2830 {
2831 return vreinterpretq_m128i_s8(
2832 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2833 }
2834
2835 // Add packed double-precision (64-bit) floating-point elements in a and b, and
2836 // store the results in dst.
2837 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2839 {
2840 #if defined(__aarch64__)
2841 return vreinterpretq_m128d_f64(
2842 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2843 #else
2844 double *da = (double *) &a;
2845 double *db = (double *) &b;
2846 double c[2];
2847 c[0] = da[0] + db[0];
2848 c[1] = da[1] + db[1];
2849 return vld1q_f32((float32_t *) c);
2850 #endif
2851 }
2852
2853 // Add the lower double-precision (64-bit) floating-point element in a and b,
2854 // store the result in the lower element of dst, and copy the upper element from
2855 // a to the upper element of dst.
2856 //
2857 // dst[63:0] := a[63:0] + b[63:0]
2858 // dst[127:64] := a[127:64]
2859 //
2860 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
2862 {
2863 #if defined(__aarch64__)
2864 return _mm_move_sd(a, _mm_add_pd(a, b));
2865 #else
2866 double *da = (double *) &a;
2867 double *db = (double *) &b;
2868 double c[2];
2869 c[0] = da[0] + db[0];
2870 c[1] = da[1];
2871 return vld1q_f32((float32_t *) c);
2872 #endif
2873 }
2874
2875 // Add 64-bit integers a and b, and store the result in dst.
2876 //
2877 // dst[63:0] := a[63:0] + b[63:0]
2878 //
2879 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2881 {
2882 return vreinterpret_m64_s64(
2883 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2884 }
2885
2886 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2887 // and saturates.
2888 //
2889 // r0 := SignedSaturate(a0 + b0)
2890 // r1 := SignedSaturate(a1 + b1)
2891 // ...
2892 // r7 := SignedSaturate(a7 + b7)
2893 //
2894 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2896 {
2897 return vreinterpretq_m128i_s16(
2898 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2899 }
2900
2901 // Add packed signed 8-bit integers in a and b using saturation, and store the
2902 // results in dst.
2903 //
2904 // FOR j := 0 to 15
2905 // i := j*8
2906 // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
2907 // ENDFOR
2908 //
2909 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
2911 {
2912 return vreinterpretq_m128i_s8(
2913 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2914 }
2915
2916 // Add packed unsigned 16-bit integers in a and b using saturation, and store
2917 // the results in dst.
2918 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2920 {
2921 return vreinterpretq_m128i_u16(
2922 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2923 }
2924
2925 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
// b and saturates.
2927 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
2929 {
2930 return vreinterpretq_m128i_u8(
2931 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2932 }
2933
2934 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
2935 // elements in a and b, and store the results in dst.
2936 //
2937 // FOR j := 0 to 1
2938 // i := j*64
2939 // dst[i+63:i] := a[i+63:i] AND b[i+63:i]
2940 // ENDFOR
2941 //
2942 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
2944 {
2945 return vreinterpretq_m128d_s64(
2946 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
2947 }
2948
2949 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
2950 // b.
2951 //
2952 // r := a & b
2953 //
2954 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
2956 {
2957 return vreinterpretq_m128i_s32(
2958 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2959 }
2960
2961 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
2962 // elements in a and then AND with b, and store the results in dst.
2963 //
2964 // FOR j := 0 to 1
2965 // i := j*64
2966 // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
2967 // ENDFOR
2968 //
2969 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
2971 {
2972 // *NOTE* argument swap
2973 return vreinterpretq_m128d_s64(
2974 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
2975 }
2976
2977 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
2978 // 128-bit value in a.
2979 //
2980 // r := (~a) & b
2981 //
2982 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
2984 {
2985 return vreinterpretq_m128i_s32(
2986 vbicq_s32(vreinterpretq_s32_m128i(b),
2987 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
2988 }
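
// Together with _mm_and_si128 and _mm_or_si128 this gives the classic
// branchless select, picking bits from b where the mask is set and from a
// elsewhere (illustrative):
//
//   __m128i blended = _mm_or_si128(_mm_and_si128(mask, b),
//                                  _mm_andnot_si128(mask, a)); // (mask & b) | (~mask & a)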
2989
2990 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
2991 // unsigned 16-bit integers in b and rounds.
2992 //
2993 // r0 := (a0 + b0) / 2
2994 // r1 := (a1 + b1) / 2
2995 // ...
2996 // r7 := (a7 + b7) / 2
2997 //
2998 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3000 {
3001 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3002 vreinterpretq_u16_m128i(b));
3003 }
3004
3005 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
3006 // unsigned 8-bit integers in b and rounds.
3007 //
3008 // r0 := (a0 + b0) / 2
3009 // r1 := (a1 + b1) / 2
3010 // ...
3011 // r15 := (a15 + b15) / 2
3012 //
3013 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3015 {
3016 return vreinterpretq_m128i_u8(
3017 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3018 }
3019
3020 // Shift a left by imm8 bytes while shifting in zeros, and store the results in
3021 // dst.
3022 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128
3023 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3024
3025 // Shift a right by imm8 bytes while shifting in zeros, and store the results in
3026 // dst.
3027 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128
3028 #define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
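
// For instance, shifting right by 4 bytes discards the 4 least-significant
// bytes and shifts in zeros at the most-significant end (illustrative):
//
//   _mm_bsrli_si128(_mm_setr_epi32(0x11111111, 0x22222222, 0x33333333, 0x44444444), 4)
//   // yields {0x22222222, 0x33333333, 0x44444444, 0x00000000} in element order 0..3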
3029
3030 // Cast vector of type __m128d to type __m128. This intrinsic is only used for
3031 // compilation and does not generate any instructions, thus it has zero latency.
3032 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3034 {
3035 return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3036 }
3037
3038 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3039 // compilation and does not generate any instructions, thus it has zero latency.
3040 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3042 {
3043 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3044 }
3045
3046 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3047 // compilation and does not generate any instructions, thus it has zero latency.
3048 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3050 {
3051 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3052 }
3053
3054 // Applies a type cast to reinterpret four 32-bit floating point values passed
3055 // in as a 128-bit parameter as packed 32-bit integers.
3056 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3058 {
3059 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3060 }
3061
3062 // Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3063 // compilation and does not generate any instructions, thus it has zero latency.
3064 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3066 {
3067 #if defined(__aarch64__)
3068 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3069 #else
3070 return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3071 #endif
3072 }
3073
3074 // Applies a type cast to reinterpret four 32-bit integers passed in as a
3075 // 128-bit parameter as packed 32-bit floating point values.
3076 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3078 {
3079 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3080 }
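
// Note that the casts above reinterpret bits rather than convert values, e.g.
// (illustrative):
//
//   _mm_castsi128_ps(_mm_set1_epi32(0x3F800000))  // every lane reads as 1.0f
//   _mm_cvtepi32_ps(_mm_set1_epi32(0x3F800000))   // every lane is 1065353216.0f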
3081
3082 // Cache line containing p is flushed and invalidated from all caches in the
// coherency domain.
3084 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
FORCE_INLINE void _mm_clflush(void const *p)
3086 {
3087 (void) p;
3088 // no corollary for Neon?
3089 }
3090
3091 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
3092 // unsigned 16-bit integers in b for equality.
3093 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3095 {
3096 return vreinterpretq_m128i_u16(
3097 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3098 }
3099
3100 // Compare packed 32-bit integers in a and b for equality, and store the results
3101 // in dst
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3103 {
3104 return vreinterpretq_m128i_u32(
3105 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3106 }
3107
3108 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
3109 // unsigned 8-bit integers in b for equality.
3110 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3112 {
3113 return vreinterpretq_m128i_u8(
3114 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3115 }
3116
3117 // Compare packed double-precision (64-bit) floating-point elements in a and b
3118 // for equality, and store the results in dst.
3119 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd
FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3121 {
3122 #if defined(__aarch64__)
3123 return vreinterpretq_m128d_u64(
3124 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3125 #else
3126 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3127 uint32x4_t cmp =
3128 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3129 uint32x4_t swapped = vrev64q_u32(cmp);
3130 return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3131 #endif
3132 }
3133
3134 // Compare the lower double-precision (64-bit) floating-point elements in a and
3135 // b for equality, store the result in the lower element of dst, and copy the
3136 // upper element from a to the upper element of dst.
3137 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd
FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3139 {
3140 return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3141 }
3142
3143 // Compare packed double-precision (64-bit) floating-point elements in a and b
3144 // for greater-than-or-equal, and store the results in dst.
3145 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3147 {
3148 #if defined(__aarch64__)
3149 return vreinterpretq_m128d_u64(
3150 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3151 #else
3152 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3153 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3154 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3155 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3156 uint64_t d[2];
3157 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3158 d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3159
3160 return vreinterpretq_m128d_u64(vld1q_u64(d));
3161 #endif
3162 }
3163
3164 // Compare the lower double-precision (64-bit) floating-point elements in a and
3165 // b for greater-than-or-equal, store the result in the lower element of dst,
3166 // and copy the upper element from a to the upper element of dst.
3167 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd
FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3169 {
3170 #if defined(__aarch64__)
3171 return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3172 #else
3173 // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3174 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3175 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3176 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3177 uint64_t d[2];
3178 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3179 d[1] = a1;
3180
3181 return vreinterpretq_m128d_u64(vld1q_u64(d));
3182 #endif
3183 }
3184
3185 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3186 // in b for greater than.
3187 //
3188 // r0 := (a0 > b0) ? 0xffff : 0x0
3189 // r1 := (a1 > b1) ? 0xffff : 0x0
3190 // ...
3191 // r7 := (a7 > b7) ? 0xffff : 0x0
3192 //
3193 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3195 {
3196 return vreinterpretq_m128i_u16(
3197 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3198 }
3199
3200 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3201 // in b for greater than.
3202 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3204 {
3205 return vreinterpretq_m128i_u32(
3206 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3207 }
3208
3209 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
3210 // in b for greater than.
3211 //
3212 // r0 := (a0 > b0) ? 0xff : 0x0
3213 // r1 := (a1 > b1) ? 0xff : 0x0
3214 // ...
3215 // r15 := (a15 > b15) ? 0xff : 0x0
3216 //
3217 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3219 {
3220 return vreinterpretq_m128i_u8(
3221 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3222 }
3223
3224 // Compare packed double-precision (64-bit) floating-point elements in a and b
3225 // for greater-than, and store the results in dst.
3226 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3228 {
3229 #if defined(__aarch64__)
3230 return vreinterpretq_m128d_u64(
3231 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3232 #else
3233 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3234 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3235 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3236 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3237 uint64_t d[2];
3238 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3239 d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3240
3241 return vreinterpretq_m128d_u64(vld1q_u64(d));
3242 #endif
3243 }
3244
3245 // Compare the lower double-precision (64-bit) floating-point elements in a and
3246 // b for greater-than, store the result in the lower element of dst, and copy
3247 // the upper element from a to the upper element of dst.
3248 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3250 {
3251 #if defined(__aarch64__)
3252 return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3253 #else
// expand "_mm_cmpgt_pd()" to reduce unnecessary operations
3255 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3256 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3257 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3258 uint64_t d[2];
3259 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3260 d[1] = a1;
3261
3262 return vreinterpretq_m128d_u64(vld1q_u64(d));
3263 #endif
3264 }
3265
3266 // Compare packed double-precision (64-bit) floating-point elements in a and b
3267 // for less-than-or-equal, and store the results in dst.
3268 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3270 {
3271 #if defined(__aarch64__)
3272 return vreinterpretq_m128d_u64(
3273 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3274 #else
3275 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3276 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3277 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3278 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3279 uint64_t d[2];
3280 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3281 d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3282
3283 return vreinterpretq_m128d_u64(vld1q_u64(d));
3284 #endif
3285 }
3286
3287 // Compare the lower double-precision (64-bit) floating-point elements in a and
3288 // b for less-than-or-equal, store the result in the lower element of dst, and
3289 // copy the upper element from a to the upper element of dst.
3290 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3292 {
3293 #if defined(__aarch64__)
3294 return _mm_move_sd(a, _mm_cmple_pd(a, b));
3295 #else
// expand "_mm_cmple_pd()" to reduce unnecessary operations
3297 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3298 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3299 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3300 uint64_t d[2];
3301 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3302 d[1] = a1;
3303
3304 return vreinterpretq_m128d_u64(vld1q_u64(d));
3305 #endif
3306 }
3307
3308 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
3309 // in b for less than.
3310 //
3311 // r0 := (a0 < b0) ? 0xffff : 0x0
3312 // r1 := (a1 < b1) ? 0xffff : 0x0
3313 // ...
3314 // r7 := (a7 < b7) ? 0xffff : 0x0
3315 //
3316 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3318 {
3319 return vreinterpretq_m128i_u16(
3320 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3321 }
3322
3324 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
3325 // in b for less than.
3326 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3328 {
3329 return vreinterpretq_m128i_u32(
3330 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3331 }
3332
3333 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
// in b for less than.
3335 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3337 {
3338 return vreinterpretq_m128i_u8(
3339 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3340 }
3341
3342 // Compare packed double-precision (64-bit) floating-point elements in a and b
3343 // for less-than, and store the results in dst.
3344 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3346 {
3347 #if defined(__aarch64__)
3348 return vreinterpretq_m128d_u64(
3349 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3350 #else
3351 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3352 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3353 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3354 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3355 uint64_t d[2];
3356 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3357 d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3358
3359 return vreinterpretq_m128d_u64(vld1q_u64(d));
3360 #endif
3361 }
3362
3363 // Compare the lower double-precision (64-bit) floating-point elements in a and
3364 // b for less-than, store the result in the lower element of dst, and copy the
3365 // upper element from a to the upper element of dst.
3366 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3368 {
3369 #if defined(__aarch64__)
3370 return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3371 #else
3372 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3373 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3374 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3375 uint64_t d[2];
3376 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3377 d[1] = a1;
3378
3379 return vreinterpretq_m128d_u64(vld1q_u64(d));
3380 #endif
3381 }
3382
3383 // Compare packed double-precision (64-bit) floating-point elements in a and b
3384 // for not-equal, and store the results in dst.
3385 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3387 {
3388 #if defined(__aarch64__)
3389 return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3390 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3391 #else
3392 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3393 uint32x4_t cmp =
3394 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3395 uint32x4_t swapped = vrev64q_u32(cmp);
3396 return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3397 #endif
3398 }
3399
3400 // Compare the lower double-precision (64-bit) floating-point elements in a and
3401 // b for not-equal, store the result in the lower element of dst, and copy the
3402 // upper element from a to the upper element of dst.
3403 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3405 {
3406 return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3407 }
3408
3409 // Compare packed double-precision (64-bit) floating-point elements in a and b
3410 // for not-greater-than-or-equal, and store the results in dst.
3411 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd
3412 #define _mm_cmpnge_pd(a, b) _mm_cmplt_pd(a, b)
3413
3414 // Compare the lower double-precision (64-bit) floating-point elements in a and
3415 // b for not-greater-than-or-equal, store the result in the lower element of
3416 // dst, and copy the upper element from a to the upper element of dst.
3417 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd
3418 #define _mm_cmpnge_sd(a, b) _mm_cmplt_sd(a, b)
3419
3420 // Compare packed double-precision (64-bit) floating-point elements in a and b
3421 // for not-greater-than, and store the results in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_pd
3423 #define _mm_cmpngt_pd(a, b) _mm_cmple_pd(a, b)
3424
3425 // Compare the lower double-precision (64-bit) floating-point elements in a and
3426 // b for not-greater-than, store the result in the lower element of dst, and
3427 // copy the upper element from a to the upper element of dst.
3428 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd
3429 #define _mm_cmpngt_sd(a, b) _mm_cmple_sd(a, b)
3430
3431 // Compare packed double-precision (64-bit) floating-point elements in a and b
3432 // for not-less-than-or-equal, and store the results in dst.
3433 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd
3434 #define _mm_cmpnle_pd(a, b) _mm_cmpgt_pd(a, b)
3435
3436 // Compare the lower double-precision (64-bit) floating-point elements in a and
3437 // b for not-less-than-or-equal, store the result in the lower element of dst,
3438 // and copy the upper element from a to the upper element of dst.
3439 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd
3440 #define _mm_cmpnle_sd(a, b) _mm_cmpgt_sd(a, b)
3441
3442 // Compare packed double-precision (64-bit) floating-point elements in a and b
3443 // for not-less-than, and store the results in dst.
3444 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd
3445 #define _mm_cmpnlt_pd(a, b) _mm_cmpge_pd(a, b)
3446
3447 // Compare the lower double-precision (64-bit) floating-point elements in a and
3448 // b for not-less-than, store the result in the lower element of dst, and copy
3449 // the upper element from a to the upper element of dst.
3450 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd
3451 #define _mm_cmpnlt_sd(a, b) _mm_cmpge_sd(a, b)
3452
3453 // Compare packed double-precision (64-bit) floating-point elements in a and b
3454 // to see if neither is NaN, and store the results in dst.
3455 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3457 {
3458 #if defined(__aarch64__)
3459 // Excluding NaNs, any two floating point numbers can be compared.
3460 uint64x2_t not_nan_a =
3461 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3462 uint64x2_t not_nan_b =
3463 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3464 return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3465 #else
3466 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3467 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3468 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3469 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3470 uint64_t d[2];
3471 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3472 (*(double *) &b0) == (*(double *) &b0))
3473 ? ~UINT64_C(0)
3474 : UINT64_C(0);
3475 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3476 (*(double *) &b1) == (*(double *) &b1))
3477 ? ~UINT64_C(0)
3478 : UINT64_C(0);
3479
3480 return vreinterpretq_m128d_u64(vld1q_u64(d));
3481 #endif
3482 }
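
// For example (illustrative), with a NaN only in the upper lane of a:
//
//   __m128d a = _mm_set_pd(0.0 / 0.0, 1.0);   // upper = NaN, lower = 1.0
//   __m128d b = _mm_set_pd(3.0, 2.0);
//   __m128d m = _mm_cmpord_pd(a, b);          // lower lane: all ones, upper lane: zero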
3483
3484 // Compare the lower double-precision (64-bit) floating-point elements in a and
3485 // b to see if neither is NaN, store the result in the lower element of dst, and
3486 // copy the upper element from a to the upper element of dst.
3487 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd
FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3489 {
3490 #if defined(__aarch64__)
3491 return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3492 #else
3493 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3494 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3495 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3496 uint64_t d[2];
3497 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3498 (*(double *) &b0) == (*(double *) &b0))
3499 ? ~UINT64_C(0)
3500 : UINT64_C(0);
3501 d[1] = a1;
3502
3503 return vreinterpretq_m128d_u64(vld1q_u64(d));
3504 #endif
3505 }
3506
3507 // Compare packed double-precision (64-bit) floating-point elements in a and b
3508 // to see if either is NaN, and store the results in dst.
3509 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3511 {
3512 #if defined(__aarch64__)
// Two NaNs are never equal to each other in a comparison operation.
3514 uint64x2_t not_nan_a =
3515 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3516 uint64x2_t not_nan_b =
3517 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3518 return vreinterpretq_m128d_s32(
3519 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3520 #else
3521 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3522 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3523 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3524 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3525 uint64_t d[2];
3526 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3527 (*(double *) &b0) == (*(double *) &b0))
3528 ? UINT64_C(0)
3529 : ~UINT64_C(0);
3530 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3531 (*(double *) &b1) == (*(double *) &b1))
3532 ? UINT64_C(0)
3533 : ~UINT64_C(0);
3534
3535 return vreinterpretq_m128d_u64(vld1q_u64(d));
3536 #endif
3537 }
3538
3539 // Compare the lower double-precision (64-bit) floating-point elements in a and
3540 // b to see if either is NaN, store the result in the lower element of dst, and
3541 // copy the upper element from a to the upper element of dst.
3542 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3544 {
3545 #if defined(__aarch64__)
3546 return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3547 #else
3548 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3549 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3550 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3551 uint64_t d[2];
3552 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3553 (*(double *) &b0) == (*(double *) &b0))
3554 ? UINT64_C(0)
3555 : ~UINT64_C(0);
3556 d[1] = a1;
3557
3558 return vreinterpretq_m128d_u64(vld1q_u64(d));
3559 #endif
3560 }
3561
3562 // Compare the lower double-precision (64-bit) floating-point element in a and b
3563 // for greater-than-or-equal, and return the boolean result (0 or 1).
3564 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd
FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3566 {
3567 #if defined(__aarch64__)
3568 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3569 #else
3570 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3571 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3572
3573 return (*(double *) &a0 >= *(double *) &b0);
3574 #endif
3575 }
3576
3577 // Compare the lower double-precision (64-bit) floating-point element in a and b
3578 // for greater-than, and return the boolean result (0 or 1).
3579 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3581 {
3582 #if defined(__aarch64__)
3583 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3584 #else
3585 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3586 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3587
3588 return (*(double *) &a0 > *(double *) &b0);
3589 #endif
3590 }
3591
3592 // Compare the lower double-precision (64-bit) floating-point element in a and b
3593 // for less-than-or-equal, and return the boolean result (0 or 1).
3594 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd
FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3596 {
3597 #if defined(__aarch64__)
3598 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3599 #else
3600 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3601 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3602
3603 return (*(double *) &a0 <= *(double *) &b0);
3604 #endif
3605 }
3606
3607 // Compare the lower double-precision (64-bit) floating-point element in a and b
3608 // for less-than, and return the boolean result (0 or 1).
3609 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd
FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3611 {
3612 #if defined(__aarch64__)
3613 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3614 #else
3615 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3616 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3617
3618 return (*(double *) &a0 < *(double *) &b0);
3619 #endif
3620 }
3621
3622 // Compare the lower double-precision (64-bit) floating-point element in a and b
3623 // for equality, and return the boolean result (0 or 1).
3624 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd
FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3626 {
3627 #if defined(__aarch64__)
3628 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3629 #else
3630 uint32x4_t a_not_nan =
3631 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3632 uint32x4_t b_not_nan =
3633 vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3634 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3635 uint32x4_t a_eq_b =
3636 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3637 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3638 vreinterpretq_u64_u32(a_eq_b));
3639 return !!vgetq_lane_u64(and_results, 0);
3640 #endif
3641 }
3642
3643 // Compare the lower double-precision (64-bit) floating-point element in a and b
3644 // for not-equal, and return the boolean result (0 or 1).
3645 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd
FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3647 {
3648 #if defined(__aarch64__)
3649 return !vgetq_lane_u64(vceqq_f64(a, b), 0);
3650 #else
3651 // FIXME we should handle NaN condition here
3652 uint32x4_t a_eq_b =
3653 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3654 return !vgetq_lane_u64(vreinterpretq_u64_u32(a_eq_b), 0);
3655 #endif
3656 }
3657
3658 // Convert packed signed 32-bit integers in a to packed double-precision
3659 // (64-bit) floating-point elements, and store the results in dst.
3660 //
3661 // FOR j := 0 to 1
3662 // i := j*32
3663 // m := j*64
3664 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3665 // ENDFOR
3666 //
3667 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3669 {
3670 #if defined(__aarch64__)
3671 return vreinterpretq_m128d_f64(
3672 vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3673 #else
3674 double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3675 double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3676 return _mm_set_pd(a1, a0);
3677 #endif
3678 }
3679
3680 // Converts the four signed 32-bit integer values of a to single-precision,
3681 // floating-point values
3682 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3684 {
3685 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3686 }
3687
3688 // Convert packed double-precision (64-bit) floating-point elements in a to
3689 // packed 32-bit integers, and store the results in dst.
3690 //
3691 // FOR j := 0 to 1
3692 // i := 32*j
3693 // k := 64*j
3694 // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3695 // ENDFOR
3696 //
3697 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3699 {
3700 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3701 double d0 = ((double *) &rnd)[0];
3702 double d1 = ((double *) &rnd)[1];
3703 return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3704 }
3705
3706 // Convert packed double-precision (64-bit) floating-point elements in a to
3707 // packed 32-bit integers, and store the results in dst.
3708 //
3709 // FOR j := 0 to 1
3710 // i := 32*j
3711 // k := 64*j
3712 // dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
3713 // ENDFOR
3714 //
3715 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_pi32
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3717 {
3718 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3719 double d0 = ((double *) &rnd)[0];
3720 double d1 = ((double *) &rnd)[1];
3721 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3722 return vreinterpret_m64_s32(vld1_s32(data));
3723 }
3724
3725 // Convert packed double-precision (64-bit) floating-point elements in a to
3726 // packed single-precision (32-bit) floating-point elements, and store the
3727 // results in dst.
3728 //
3729 // FOR j := 0 to 1
3730 // i := 32*j
3731 // k := 64*j
//   dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
3733 // ENDFOR
3734 // dst[127:64] := 0
3735 //
3736 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3738 {
3739 #if defined(__aarch64__)
3740 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3741 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3742 #else
3743 float a0 = (float) ((double *) &a)[0];
3744 float a1 = (float) ((double *) &a)[1];
3745 return _mm_set_ps(0, 0, a1, a0);
3746 #endif
3747 }
3748
3749 // Convert packed signed 32-bit integers in a to packed double-precision
3750 // (64-bit) floating-point elements, and store the results in dst.
3751 //
3752 // FOR j := 0 to 1
3753 // i := j*32
3754 // m := j*64
3755 // dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
3756 // ENDFOR
3757 //
3758 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3760 {
3761 #if defined(__aarch64__)
3762 return vreinterpretq_m128d_f64(
3763 vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3764 #else
3765 double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3766 double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3767 return _mm_set_pd(a1, a0);
3768 #endif
3769 }
3770
3771 // Converts the four single-precision, floating-point values of a to signed
3772 // 32-bit integer values.
3773 //
3774 // r0 := (int) a0
3775 // r1 := (int) a1
3776 // r2 := (int) a2
3777 // r3 := (int) a3
3778 //
3779 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
3780 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3781 // does not support! It is supported on ARMv8-A however.
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3783 {
3784 #if defined(__aarch64__)
3785 switch (_MM_GET_ROUNDING_MODE()) {
3786 case _MM_ROUND_NEAREST:
3787 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3788 case _MM_ROUND_DOWN:
3789 return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3790 case _MM_ROUND_UP:
3791 return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
3792 default: // _MM_ROUND_TOWARD_ZERO
3793 return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3794 }
3795 #else
3796 float *f = (float *) &a;
3797 switch (_MM_GET_ROUNDING_MODE()) {
3798 case _MM_ROUND_NEAREST: {
3799 uint32x4_t signmask = vdupq_n_u32(0x80000000);
3800 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3801 vdupq_n_f32(0.5f)); /* +/- 0.5 */
3802 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3803 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
3804 int32x4_t r_trunc = vcvtq_s32_f32(
3805 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
3806 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3807 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
3808 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3809 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
3810 float32x4_t delta = vsubq_f32(
3811 vreinterpretq_f32_m128(a),
3812 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
3813 uint32x4_t is_delta_half =
3814 vceqq_f32(delta, half); /* delta == +/- 0.5 */
3815 return vreinterpretq_m128i_s32(
3816 vbslq_s32(is_delta_half, r_even, r_normal));
3817 }
3818 case _MM_ROUND_DOWN:
3819 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
3820 floorf(f[0]));
3821 case _MM_ROUND_UP:
3822 return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
3823 ceilf(f[0]));
3824 default: // _MM_ROUND_TOWARD_ZERO
3825 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
3826 (int32_t) f[0]);
3827 }
3828 #endif
3829 }
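
// With the default _MM_ROUND_NEAREST mode, ties round to the nearest even
// integer, e.g. (illustrative):
//
//   _mm_cvtps_epi32(_mm_setr_ps(0.5f, 1.5f, 2.5f, -2.5f))
//   // yields {0, 2, 2, -2}: halfway cases go to the even neighbour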
3830
3831 // Convert packed single-precision (32-bit) floating-point elements in a to
3832 // packed double-precision (64-bit) floating-point elements, and store the
3833 // results in dst.
3834 //
3835 // FOR j := 0 to 1
3836 // i := 64*j
3837 // k := 32*j
3838 // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
3839 // ENDFOR
3840 //
3841 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
3843 {
3844 #if defined(__aarch64__)
3845 return vreinterpretq_m128d_f64(
3846 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
3847 #else
3848 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
3849 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
3850 return _mm_set_pd(a1, a0);
3851 #endif
3852 }
3853
3854 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
3855 //
3856 // dst[63:0] := a[63:0]
3857 //
3858 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
3860 {
3861 #if defined(__aarch64__)
3862 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
3863 #else
3864 return ((double *) &a)[0];
3865 #endif
3866 }
3867
3868 // Convert the lower double-precision (64-bit) floating-point element in a to a
3869 // 32-bit integer, and store the result in dst.
3870 //
3871 // dst[31:0] := Convert_FP64_To_Int32(a[63:0])
3872 //
3873 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
3875 {
3876 #if defined(__aarch64__)
3877 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3878 #else
3879 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3880 double ret = ((double *) &rnd)[0];
3881 return (int32_t) ret;
3882 #endif
3883 }
3884
3885 // Convert the lower double-precision (64-bit) floating-point element in a to a
3886 // 64-bit integer, and store the result in dst.
3887 //
3888 // dst[63:0] := Convert_FP64_To_Int64(a[63:0])
3889 //
3890 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
3892 {
3893 #if defined(__aarch64__)
3894 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3895 #else
3896 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3897 double ret = ((double *) &rnd)[0];
3898 return (int64_t) ret;
3899 #endif
3900 }
3901
3902 // Convert the lower double-precision (64-bit) floating-point element in a to a
3903 // 64-bit integer, and store the result in dst.
3904 //
3905 // dst[63:0] := Convert_FP64_To_Int64(a[63:0])
3906 //
3907 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x
3908 #define _mm_cvtsd_si64x _mm_cvtsd_si64
3909
3910 // Convert the lower double-precision (64-bit) floating-point element in b to a
3911 // single-precision (32-bit) floating-point element, store the result in the
3912 // lower element of dst, and copy the upper 3 packed elements from a to the
3913 // upper elements of dst.
3914 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
3916 {
3917 #if defined(__aarch64__)
3918 return vreinterpretq_m128_f32(vsetq_lane_f32(
3919 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
3920 vreinterpretq_f32_m128(a), 0));
3921 #else
3922 return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
3923 vreinterpretq_f32_m128(a), 0));
3924 #endif
3925 }
3926
3927 // Copy the lower 32-bit integer in a to dst.
3928 //
3929 // dst[31:0] := a[31:0]
3930 //
3931 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
3933 {
3934 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3935 }
3936
3937 // Copy the lower 64-bit integer in a to dst.
3938 //
3939 // dst[63:0] := a[63:0]
3940 //
3941 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
3943 {
3944 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
3945 }
3946
3947 // Copy the lower 64-bit integer in a to dst.
3948 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
3949 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
3950
3951 // Convert the signed 32-bit integer b to a double-precision (64-bit)
3952 // floating-point element, store the result in the lower element of dst, and
3953 // copy the upper element from a to the upper element of dst.
3954 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd
FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
3956 {
3957 #if defined(__aarch64__)
3958 return vreinterpretq_m128d_f64(
3959 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
3960 #else
3961 double bf = (double) b;
3962 return vreinterpretq_m128d_s64(
3963 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
3964 #endif
3965 }
3966
// Moves 32-bit integer a to the least significant 32 bits of an __m128i object,
3975 // zero extending the upper bits.
3976 //
3977 // r0 := a
3978 // r1 := 0x0
3979 // r2 := 0x0
3980 // r3 := 0x0
3981 //
3982 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
3984 {
3985 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
3986 }
3987
3988 // Convert the signed 64-bit integer b to a double-precision (64-bit)
3989 // floating-point element, store the result in the lower element of dst, and
3990 // copy the upper element from a to the upper element of dst.
3991 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd
FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
3993 {
3994 #if defined(__aarch64__)
3995 return vreinterpretq_m128d_f64(
3996 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
3997 #else
3998 double bf = (double) b;
3999 return vreinterpretq_m128d_s64(
4000 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4001 #endif
4002 }
4003
// Moves 64-bit integer a to the least significant 64 bits of an __m128i object,
4005 // zero extending the upper bits.
4006 //
4007 // r0 := a
4008 // r1 := 0x0
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4010 {
4011 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4012 }
4013
4014 // Copy 64-bit integer a to the lower element of dst, and zero the upper
4015 // element.
4016 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128
4017 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4018
4019 // Convert the signed 64-bit integer b to a double-precision (64-bit)
4020 // floating-point element, store the result in the lower element of dst, and
4021 // copy the upper element from a to the upper element of dst.
4022 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd
4023 #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4024
4025 // Convert the lower single-precision (32-bit) floating-point element in b to a
4026 // double-precision (64-bit) floating-point element, store the result in the
4027 // lower element of dst, and copy the upper element from a to the upper element
4028 // of dst.
4029 //
4030 // dst[63:0] := Convert_FP32_To_FP64(b[31:0])
4031 // dst[127:64] := a[127:64]
4032 //
4033 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4035 {
4036 double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4037 #if defined(__aarch64__)
4038 return vreinterpretq_m128d_f64(
4039 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4040 #else
4041 return vreinterpretq_m128d_s64(
4042 vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4043 #endif
4044 }
4045
4046 // Convert packed double-precision (64-bit) floating-point elements in a to
4047 // packed 32-bit integers with truncation, and store the results in dst.
4048 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32
FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4050 {
4051 double a0 = ((double *) &a)[0];
4052 double a1 = ((double *) &a)[1];
4053 return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4054 }
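
// Usage sketch (illustrative only, not part of the translation layer):
//
//   __m128d v = _mm_set_pd(-1.7, 2.9);   // upper = -1.7, lower = 2.9
//   __m128i r = _mm_cvttpd_epi32(v);
//   // Truncation rounds toward zero: 2.9 -> 2 and -1.7 -> -1, so the low
//   // two 32-bit lanes of r hold 2 and -1, and the upper two lanes are 0.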
4055
4056 // Convert packed double-precision (64-bit) floating-point elements in a to
4057 // packed 32-bit integers with truncation, and store the results in dst.
4058 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_pi32
4059 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4060 {
4061 double a0 = ((double *) &a)[0];
4062 double a1 = ((double *) &a)[1];
4063 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4064 return vreinterpret_m64_s32(vld1_s32(data));
4065 }
4066
4067 // Converts the four single-precision, floating-point values of a to signed
4068 // 32-bit integer values using truncate.
4069 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4070 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4071 {
4072 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4073 }
4074
4075 // Convert the lower double-precision (64-bit) floating-point element in a to a
4076 // 32-bit integer with truncation, and store the result in dst.
4077 //
4078 // dst[63:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
4079 //
4080 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32
4081 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4082 {
4083 double ret = *((double *) &a);
4084 return (int32_t) ret;
4085 }
4086
4087 // Convert the lower double-precision (64-bit) floating-point element in a to a
4088 // 64-bit integer with truncation, and store the result in dst.
4089 //
4090 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4091 //
4092 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
4093 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4094 {
4095 #if defined(__aarch64__)
4096 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4097 #else
4098 double ret = *((double *) &a);
4099 return (int64_t) ret;
4100 #endif
4101 }
4102
4103 // Convert the lower double-precision (64-bit) floating-point element in a to a
4104 // 64-bit integer with truncation, and store the result in dst.
4105 //
4106 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4107 //
4108 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
4109 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4110
4111 // Divide packed double-precision (64-bit) floating-point elements in a by
4112 // packed elements in b, and store the results in dst.
4113 //
4114 // FOR j := 0 to 1
4115 // i := 64*j
4116 // dst[i+63:i] := a[i+63:i] / b[i+63:i]
4117 // ENDFOR
4118 //
4119 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
4120 FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4121 {
4122 #if defined(__aarch64__)
4123 return vreinterpretq_m128d_f64(
4124 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4125 #else
4126 double *da = (double *) &a;
4127 double *db = (double *) &b;
4128 double c[2];
4129 c[0] = da[0] / db[0];
4130 c[1] = da[1] / db[1];
4131 return vld1q_f32((float32_t *) c);
4132 #endif
4133 }
4134
4135 // Divide the lower double-precision (64-bit) floating-point element in a by the
4136 // lower double-precision (64-bit) floating-point element in b, store the result
4137 // in the lower element of dst, and copy the upper element from a to the upper
4138 // element of dst.
4139 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
4140 FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4141 {
4142 #if defined(__aarch64__)
4143 float64x2_t tmp =
4144 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4145 return vreinterpretq_m128d_f64(
4146 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4147 #else
4148 return _mm_move_sd(a, _mm_div_pd(a, b));
4149 #endif
4150 }
4151
4152 // Extracts the selected signed or unsigned 16-bit integer from a and zero
4153 // extends.
4154 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
4155 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4156 #define _mm_extract_epi16(a, imm) \
4157 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4158
4159 // Inserts the least significant 16 bits of b into the selected 16-bit integer
4160 // of a.
4161 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
4162 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4163 // __constrange(0,8) int imm)
4164 #define _mm_insert_epi16(a, b, imm) \
4165 __extension__({ \
4166 vreinterpretq_m128i_s16( \
4167 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
4168 })
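
// Usage sketch (illustrative only): the lane index passed to these macros
// must be a compile-time constant in the range [0, 7].
//
//   __m128i v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
//   int lane3 = _mm_extract_epi16(v, 3);    // lane3 == 3 (zero extended)
//   __m128i w = _mm_insert_epi16(v, -1, 3); // lane 3 of w is now 0xFFFF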
4169
4170 // Loads two double-precision floating-point values from 16-byte aligned
4171 // memory.
4172 //
4173 // dst[127:0] := MEM[mem_addr+127:mem_addr]
4174 //
4175 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
4176 FORCE_INLINE __m128d _mm_load_pd(const double *p)
4177 {
4178 #if defined(__aarch64__)
4179 return vreinterpretq_m128d_f64(vld1q_f64(p));
4180 #else
4181 const float *fp = (const float *) p;
4182 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4183 return vreinterpretq_m128d_f32(vld1q_f32(data));
4184 #endif
4185 }
4186
4187 // Load a double-precision (64-bit) floating-point element from memory into both
4188 // elements of dst.
4189 //
4190 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4191 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4192 //
4193 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4194 #define _mm_load_pd1 _mm_load1_pd
4195
4196 // Load a double-precision (64-bit) floating-point element from memory into the
4197 // lower of dst, and zero the upper element. mem_addr does not need to be
4198 // aligned on any particular boundary.
4199 //
4200 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4201 // dst[127:64] := 0
4202 //
4203 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
4204 FORCE_INLINE __m128d _mm_load_sd(const double *p)
4205 {
4206 #if defined(__aarch64__)
4207 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4208 #else
4209 const float *fp = (const float *) p;
4210 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4211 return vreinterpretq_m128d_f32(vld1q_f32(data));
4212 #endif
4213 }
4214
4215 // Loads 128-bit value.
4216 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4217 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4218 {
4219 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4220 }
4221
4222 // Load a double-precision (64-bit) floating-point element from memory into both
4223 // elements of dst.
4224 //
4225 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4226 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4227 //
4228 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4229 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4230 {
4231 #if defined(__aarch64__)
4232 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4233 #else
4234 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4235 #endif
4236 }
4237
4238 // Load a double-precision (64-bit) floating-point element from memory into the
4239 // upper element of dst, and copy the lower element from a to dst. mem_addr does
4240 // not need to be aligned on any particular boundary.
4241 //
4242 // dst[63:0] := a[63:0]
4243 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4244 //
4245 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4246 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4247 {
4248 #if defined(__aarch64__)
4249 return vreinterpretq_m128d_f64(
4250 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4251 #else
4252 return vreinterpretq_m128d_f32(vcombine_f32(
4253 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4254 #endif
4255 }
4256
4257 // Load 64-bit integer from memory into the first element of dst.
4258 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
4259 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4260 {
4261 /* Load the lower 64 bits of the value pointed to by p into the
4262 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4263 */
4264 return vreinterpretq_m128i_s32(
4265 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4266 }
4267
4268 // Load a double-precision (64-bit) floating-point element from memory into the
4269 // lower element of dst, and copy the upper element from a to dst. mem_addr does
4270 // not need to be aligned on any particular boundary.
4271 //
4272 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4273 // dst[127:64] := a[127:64]
4274 //
4275 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
4276 FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4277 {
4278 #if defined(__aarch64__)
4279 return vreinterpretq_m128d_f64(
4280 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4281 #else
4282 return vreinterpretq_m128d_f32(
4283 vcombine_f32(vld1_f32((const float *) p),
4284 vget_high_f32(vreinterpretq_f32_m128d(a))));
4285 #endif
4286 }
4287
4288 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
4289 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4290 // general-protection exception may be generated.
4291 //
4292 // dst[63:0] := MEM[mem_addr+127:mem_addr+64]
4293 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4294 //
4295 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
4296 FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4297 {
4298 #if defined(__aarch64__)
4299 float64x2_t v = vld1q_f64(p);
4300 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4301 #else
4302 int64x2_t v = vld1q_s64((const int64_t *) p);
4303 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4304 #endif
4305 }
4306
4307 // Loads two double-precision floating-point values from unaligned memory.
4308 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
4309 FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4310 {
4311 return _mm_load_pd(p);
4312 }
4313
4314 // Loads 128-bit value.
4315 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4316 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4317 {
4318 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4319 }
4320
4321 // Load unaligned 32-bit integer from memory into the first element of dst.
4322 //
4323 // dst[31:0] := MEM[mem_addr+31:mem_addr]
4324 // dst[MAX:32] := 0
4325 //
4326 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4327 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4328 {
4329 return vreinterpretq_m128i_s32(
4330 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4331 }
4332
4333 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4334 // integers from b.
4335 //
4336 // r0 := (a0 * b0) + (a1 * b1)
4337 // r1 := (a2 * b2) + (a3 * b3)
4338 // r2 := (a4 * b4) + (a5 * b5)
4339 // r3 := (a6 * b6) + (a7 * b7)
4340 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
4341 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4342 {
4343 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4344 vget_low_s16(vreinterpretq_s16_m128i(b)));
4345 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4346 vget_high_s16(vreinterpretq_s16_m128i(b)));
4347
4348 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4349 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4350
4351 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4352 }
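
// Worked example (illustrative only):
//
//   __m128i a = _mm_set1_epi16(2);
//   __m128i b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
//   __m128i r = _mm_madd_epi16(a, b);
//   // Each 32-bit lane is the sum of two adjacent 16-bit products:
//   // lane 0 = 2*1 + 2*2 = 6, lane 1 = 2*3 + 2*4 = 14,
//   // lane 2 = 2*5 + 2*6 = 22, lane 3 = 2*7 + 2*8 = 30.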
4353
4354 // Conditionally store 8-bit integer elements from a into memory using mask
4355 // (elements are not stored when the highest bit is not set in the corresponding
4356 // element) and a non-temporal memory hint. mem_addr does not need to be aligned
4357 // on any particular boundary.
4358 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
4359 FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4360 {
4361 int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4362 __m128 b = _mm_load_ps((const float *) mem_addr);
4363 int8x16_t masked =
4364 vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4365 vreinterpretq_s8_m128(b));
4366 vst1q_s8((int8_t *) mem_addr, masked);
4367 }
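
// Usage sketch (illustrative only; the buffer name is hypothetical): only
// bytes whose mask element has its most significant bit set are written,
// the remaining destination bytes are left untouched.
//
//   char buf[16] = {0};
//   __m128i data = _mm_set1_epi8(0x55);
//   __m128i mask = _mm_set_epi8(-1, 0, -1, 0, -1, 0, -1, 0,
//                               -1, 0, -1, 0, -1, 0, -1, 0);
//   _mm_maskmoveu_si128(data, mask, buf);
//   // buf now alternates between 0x55 (mask byte negative) and 0.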
4368
4369 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
4370 // signed 16-bit integers from b.
4371 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
4372 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4373 {
4374 return vreinterpretq_m128i_s16(
4375 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4376 }
4377
4378 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
4379 // 16 unsigned 8-bit integers from b.
4380 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
4381 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4382 {
4383 return vreinterpretq_m128i_u8(
4384 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4385 }
4386
4387 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4388 // and store packed maximum values in dst.
4389 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
4390 FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4391 {
4392 #if defined(__aarch64__)
4393 return vreinterpretq_m128d_f64(
4394 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4395 #else
4396 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4397 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4398 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4399 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4400 uint64_t d[2];
4401 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4402 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4403
4404 return vreinterpretq_m128d_u64(vld1q_u64(d));
4405 #endif
4406 }
4407
4408 // Compare the lower double-precision (64-bit) floating-point elements in a and
4409 // b, store the maximum value in the lower element of dst, and copy the upper
4410 // element from a to the upper element of dst.
4411 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
4412 FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4413 {
4414 #if defined(__aarch64__)
4415 return _mm_move_sd(a, _mm_max_pd(a, b));
4416 #else
4417 double *da = (double *) &a;
4418 double *db = (double *) &b;
4419 double c[2] = {fmax(da[0], db[0]), da[1]};
4420 return vld1q_f32((float32_t *) c);
4421 #endif
4422 }
4423
4424 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
4425 // signed 16-bit integers from b.
4426 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
4427 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4428 {
4429 return vreinterpretq_m128i_s16(
4430 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4431 }
4432
4433 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
4434 // 16 unsigned 8-bit integers from b.
4435 // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
4436 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4437 {
4438 return vreinterpretq_m128i_u8(
4439 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4440 }
4441
4442 // Compare packed double-precision (64-bit) floating-point elements in a and b,
4443 // and store packed minimum values in dst.
4444 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
4445 FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4446 {
4447 #if defined(__aarch64__)
4448 return vreinterpretq_m128d_f64(
4449 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4450 #else
4451 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4452 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4453 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4454 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4455 uint64_t d[2];
4456 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4457 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4458 return vreinterpretq_m128d_u64(vld1q_u64(d));
4459 #endif
4460 }
4461
4462 // Compare the lower double-precision (64-bit) floating-point elements in a and
4463 // b, store the minimum value in the lower element of dst, and copy the upper
4464 // element from a to the upper element of dst.
4465 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
4466 FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4467 {
4468 #if defined(__aarch64__)
4469 return _mm_move_sd(a, _mm_min_pd(a, b));
4470 #else
4471 double *da = (double *) &a;
4472 double *db = (double *) &b;
4473 double c[2] = {fmin(da[0], db[0]), da[1]};
4474 return vld1q_f32((float32_t *) c);
4475 #endif
4476 }
4477
4478 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4479 // upper element.
4480 //
4481 // dst[63:0] := a[63:0]
4482 // dst[127:64] := 0
4483 //
4484 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
4485 FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4486 {
4487 return vreinterpretq_m128i_s64(
4488 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4489 }
4490
4491 // Move the lower double-precision (64-bit) floating-point element from b to the
4492 // lower element of dst, and copy the upper element from a to the upper element
4493 // of dst.
4494 //
4495 // dst[63:0] := b[63:0]
4496 // dst[127:64] := a[127:64]
4497 //
4498 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
4499 FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4500 {
4501 return vreinterpretq_m128d_f32(
4502 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4503 vget_high_f32(vreinterpretq_f32_m128d(a))));
4504 }
4505
4506 // NEON does not provide a version of this function.
4507 // Creates a 16-bit mask from the most significant bits of the 16 signed or
4508 // unsigned 8-bit integers in a and zero extends the upper bits.
4509 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
4510 FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4511 {
4512 // Use increasingly wide shifts+adds to collect the sign bits
4513 // together.
4514 // Since the widening shifts would be rather confusing to follow in little
4515 // endian, everything will be illustrated in big endian order instead. This
4516 // has a different result - the bits would actually be reversed on a big
4517 // endian machine.
4518
4519 // Starting input (only half the elements are shown):
4520 // 89 ff 1d c0 00 10 99 33
4521 uint8x16_t input = vreinterpretq_u8_m128i(a);
4522
4523 // Shift out everything but the sign bits with an unsigned shift right.
4524 //
4525 // Bytes of the vector::
4526 // 89 ff 1d c0 00 10 99 33
4527 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4528 // | | | | | | | |
4529 // 01 01 00 01 00 00 01 00
4530 //
4531 // Bits of first important lane(s):
4532 // 10001001 (89)
4533 // \______
4534 // |
4535 // 00000001 (01)
4536 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4537
4538 // Merge the even lanes together with a 16-bit unsigned shift right + add.
4539 // 'xx' represents garbage data which will be ignored in the final result.
4540 // In the important bytes, the add functions like a binary OR.
4541 //
4542 // 01 01 00 01 00 00 01 00
4543 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4544 // \| \| \| \|
4545 // xx 03 xx 01 xx 00 xx 02
4546 //
4547 // 00000001 00000001 (01 01)
4548 // \_______ |
4549 // \|
4550 // xxxxxxxx xxxxxx11 (xx 03)
4551 uint32x4_t paired16 =
4552 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4553
4554 // Repeat with a wider 32-bit shift + add.
4555 // xx 03 xx 01 xx 00 xx 02
4556 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
4557 // 14))
4558 // \| \|
4559 // xx xx xx 0d xx xx xx 02
4560 //
4561 // 00000011 00000001 (03 01)
4562 // \\_____ ||
4563 // '----.\||
4564 // xxxxxxxx xxxx1101 (xx 0d)
4565 uint64x2_t paired32 =
4566 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4567
4568 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4569 // lanes. xx xx xx 0d xx xx xx 02
4570 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
4571 // 28))
4572 // \|
4573 // xx xx xx xx xx xx xx d2
4574 //
4575 // 00001101 00000010 (0d 02)
4576 // \ \___ | |
4577 // '---. \| |
4578 // xxxxxxxx 11010010 (xx d2)
4579 uint8x16_t paired64 =
4580 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4581
4582 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4583 // xx xx xx xx xx xx xx d2
4584 // || return paired64[0]
4585 // d2
4586 // Note: Little endian would return the correct value 4b (01001011) instead.
4587 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4588 }
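
// Usage sketch (illustrative only): byte 0's sign bit lands in bit 0 of the
// returned mask.
//
//   __m128i v = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
//                            0, 0, 0, 0, 0, 0, 0, -128);
//   int mask = _mm_movemask_epi8(v);   // mask == 0x0001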
4589
4590 // Set each bit of mask dst based on the most significant bit of the
4591 // corresponding packed double-precision (64-bit) floating-point element in a.
4592 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
4593 FORCE_INLINE int _mm_movemask_pd(__m128d a)
4594 {
4595 uint64x2_t input = vreinterpretq_u64_m128d(a);
4596 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4597 return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
4598 }
4599
4600 // Copy the lower 64-bit integer in a to dst.
4601 //
4602 // dst[63:0] := a[63:0]
4603 //
4604 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
4605 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4606 {
4607 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4608 }
4609
4610 // Copy the 64-bit integer a to the lower element of dst, and zero the upper
4611 // element.
4612 //
4613 // dst[63:0] := a[63:0]
4614 // dst[127:64] := 0
4615 //
4616 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
4617 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4618 {
4619 return vreinterpretq_m128i_s64(
4620 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4621 }
4622
4623 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4624 // a and b, and store the unsigned 64-bit results in dst.
4625 //
4626 // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
4627 // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
4628 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4629 {
4630 // vmull_u32 upcasts instead of masking, so we downcast.
4631 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4632 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4633 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4634 }
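
// Usage sketch (illustrative only): only the low 32 bits of each 64-bit lane
// participate, so the full unsigned 32-bit range can be multiplied safely.
//
//   __m128i a = _mm_set_epi32(9, -1, 9, -1);   // low halves are 0xFFFFFFFF
//   __m128i b = _mm_set_epi32(9, 2, 9, 3);     // low halves are 2 and 3
//   __m128i r = _mm_mul_epu32(a, b);
//   // r[63:0]   == 0xFFFFFFFFULL * 3 == 0x2FFFFFFFD
//   // r[127:64] == 0xFFFFFFFFULL * 2 == 0x1FFFFFFFE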
4635
4636 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
4637 // and store the results in dst.
4638 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
4639 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4640 {
4641 #if defined(__aarch64__)
4642 return vreinterpretq_m128d_f64(
4643 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4644 #else
4645 double *da = (double *) &a;
4646 double *db = (double *) &b;
4647 double c[2];
4648 c[0] = da[0] * db[0];
4649 c[1] = da[1] * db[1];
4650 return vld1q_f32((float32_t *) c);
4651 #endif
4652 }
4653
4654 // Multiply the lower double-precision (64-bit) floating-point element in a and
4655 // b, store the result in the lower element of dst, and copy the upper element
4656 // from a to the upper element of dst.
4657 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
4658 FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4659 {
4660 return _mm_move_sd(a, _mm_mul_pd(a, b));
4661 }
4662
4663 // Multiply the low unsigned 32-bit integers from a and b, and store the
4664 // unsigned 64-bit result in dst.
4665 //
4666 // dst[63:0] := a[31:0] * b[31:0]
4667 //
4668 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
4669 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4670 {
4671 return vreinterpret_m64_u64(vget_low_u64(
4672 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4673 }
4674
4675 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
4676 // integers from b.
4677 //
4678 // r0 := (a0 * b0)[31:16]
4679 // r1 := (a1 * b1)[31:16]
4680 // ...
4681 // r7 := (a7 * b7)[31:16]
4682 //
4683 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
4684 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4685 {
4686 /* FIXME: issue with large values because of result saturation */
4687 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4688 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4689 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4690 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4691 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4692 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4693 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4694 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4695 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4696 uint16x8x2_t r =
4697 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4698 return vreinterpretq_m128i_u16(r.val[1]);
4699 }
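
// Worked example (illustrative only): the full 32-bit product is formed and
// only its upper half is kept.
//
//   __m128i a = _mm_set1_epi16(0x4000);   // 16384
//   __m128i b = _mm_set1_epi16(0x0200);   // 512
//   __m128i r = _mm_mulhi_epi16(a, b);
//   // 16384 * 512 == 0x00800000, so every lane of r holds 0x0080 (128).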
4700
4701 // Multiply the packed unsigned 16-bit integers in a and b, producing
4702 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
4703 // integers in dst.
4704 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
4705 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4706 {
4707 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4708 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4709 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4710 #if defined(__aarch64__)
4711 uint32x4_t ab7654 =
4712 vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4713 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4714 vreinterpretq_u16_u32(ab7654));
4715 return vreinterpretq_m128i_u16(r);
4716 #else
4717 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4718 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4719 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4720 uint16x8x2_t r =
4721 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4722 return vreinterpretq_m128i_u16(r.val[1]);
4723 #endif
4724 }
4725
4726 // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
4727 // unsigned 16-bit integers from b.
4728 //
4729 // r0 := (a0 * b0)[15:0]
4730 // r1 := (a1 * b1)[15:0]
4731 // ...
4732 // r7 := (a7 * b7)[15:0]
4733 //
4734 // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
4735 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4736 {
4737 return vreinterpretq_m128i_s16(
4738 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4739 }
4740
4741 // Compute the bitwise OR of packed double-precision (64-bit) floating-point
4742 // elements in a and b, and store the results in dst.
4743 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
4744 FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4745 {
4746 return vreinterpretq_m128d_s64(
4747 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4748 }
4749
4750 // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
4751 //
4752 // r := a | b
4753 //
4754 // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
4755 FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4756 {
4757 return vreinterpretq_m128i_s32(
4758 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4759 }
4760
4761 // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
4762 // saturates.
4763 // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
4764 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4765 {
4766 return vreinterpretq_m128i_s8(
4767 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4768 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4769 }
4770
4771 // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
4772 // and saturates.
4773 //
4774 // r0 := SignedSaturate(a0)
4775 // r1 := SignedSaturate(a1)
4776 // r2 := SignedSaturate(a2)
4777 // r3 := SignedSaturate(a3)
4778 // r4 := SignedSaturate(b0)
4779 // r5 := SignedSaturate(b1)
4780 // r6 := SignedSaturate(b2)
4781 // r7 := SignedSaturate(b3)
4782 //
4783 // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
4784 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4785 {
4786 return vreinterpretq_m128i_s16(
4787 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4788 vqmovn_s32(vreinterpretq_s32_m128i(b))));
4789 }
4790
4791 // Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
4792 // integers and saturates.
4793 //
4794 // r0 := UnsignedSaturate(a0)
4795 // r1 := UnsignedSaturate(a1)
4796 // ...
4797 // r7 := UnsignedSaturate(a7)
4798 // r8 := UnsignedSaturate(b0)
4799 // r9 := UnsignedSaturate(b1)
4800 // ...
4801 // r15 := UnsignedSaturate(b7)
4802 //
4803 // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
4804 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4805 {
4806 return vreinterpretq_m128i_u8(
4807 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4808 vqmovun_s16(vreinterpretq_s16_m128i(b))));
4809 }
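
// Usage sketch (illustrative only): out-of-range values are clamped rather
// than truncated.
//
//   __m128i a = _mm_set_epi16(300, -5, 255, 0, 300, -5, 255, 0);
//   __m128i r = _mm_packus_epi16(a, a);
//   // Each 16-bit source element becomes one byte: 300 saturates to 255,
//   // -5 saturates to 0, while 255 and 0 pass through unchanged.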
4810
4811 // Pause the processor. This is typically used in spin-wait loops and depending
4812 // on the x86 processor, typical delays are in the 40-100 cycle range. The
4813 // 'yield' instruction isn't a good fit because it's effectively a nop on most
4814 // Arm cores. Experience with several databases has shown that an 'isb' is
4815 // a reasonable approximation.
4816 FORCE_INLINE void _mm_pause()
4817 {
4818 __asm__ __volatile__("isb\n");
4819 }
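
// Usage sketch (illustrative only), assuming C11 <stdatomic.h> and a
// caller-provided atomic flag named 'ready' (hypothetical):
//
//   while (!atomic_load_explicit(&ready, memory_order_acquire))
//       _mm_pause();   // back off while spinning on the flag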
4820
4821 // Compute the absolute differences of packed unsigned 8-bit integers in a and
4822 // b, then horizontally sum each consecutive 8 differences to produce two
4823 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
4824 // 16 bits of 64-bit elements in dst.
4825 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
4826 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
4827 {
4828 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4829 return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
4830 }
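
// Worked example (illustrative only):
//
//   __m128i a = _mm_set1_epi8(10);
//   __m128i b = _mm_set1_epi8(7);
//   __m128i r = _mm_sad_epu8(a, b);
//   // |10 - 7| == 3 for all 16 bytes; each group of 8 bytes sums to 24,
//   // so r[63:0] == 24 and r[127:64] == 24.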
4831
4832 // Sets the 8 signed 16-bit integer values.
4833 // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
4834 FORCE_INLINE __m128i _mm_set_epi16(short i7,
4835 short i6,
4836 short i5,
4837 short i4,
4838 short i3,
4839 short i2,
4840 short i1,
4841 short i0)
4842 {
4843 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4844 return vreinterpretq_m128i_s16(vld1q_s16(data));
4845 }
4846
4847 // Sets the 4 signed 32-bit integer values.
4848 // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
4849 FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
4850 {
4851 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
4852 return vreinterpretq_m128i_s32(vld1q_s32(data));
4853 }
4854
4855 // Returns the __m128i structure with its two 64-bit integer values
4856 // initialized to the values of the two 64-bit integers passed in.
4857 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
4858 FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4859 {
4860 return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
4861 }
4862
4863 // Returns the __m128i structure with its two 64-bit integer values
4864 // initialized to the values of the two 64-bit integers passed in.
4865 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
4866 FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4867 {
4868 return vreinterpretq_m128i_s64(
4869 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4870 }
4871
4872 // Sets the 16 signed 8-bit integer values.
4873 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
4874 FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
4875 signed char b14,
4876 signed char b13,
4877 signed char b12,
4878 signed char b11,
4879 signed char b10,
4880 signed char b9,
4881 signed char b8,
4882 signed char b7,
4883 signed char b6,
4884 signed char b5,
4885 signed char b4,
4886 signed char b3,
4887 signed char b2,
4888 signed char b1,
4889 signed char b0)
4890 {
4891 int8_t ALIGN_STRUCT(16)
4892 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
4893 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
4894 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
4895 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4896 return (__m128i) vld1q_s8(data);
4897 }
4898
4899 // Set packed double-precision (64-bit) floating-point elements in dst with the
4900 // supplied values.
4901 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
4902 FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
4903 {
4904 double ALIGN_STRUCT(16) data[2] = {e0, e1};
4905 #if defined(__aarch64__)
4906 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
4907 #else
4908 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
4909 #endif
4910 }
4911
4912 // Broadcast double-precision (64-bit) floating-point value a to all elements of
4913 // dst.
4914 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
4915 #define _mm_set_pd1 _mm_set1_pd
4916
4917 // Copy double-precision (64-bit) floating-point element a to the lower element
4918 // of dst, and zero the upper element.
4919 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
4920 FORCE_INLINE __m128d _mm_set_sd(double a)
4921 {
4922 return _mm_set_pd(0, a);
4923 }
4924
4925 // Sets the 8 signed 16-bit integer values to w.
4926 //
4927 // r0 := w
4928 // r1 := w
4929 // ...
4930 // r7 := w
4931 //
4932 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
4933 FORCE_INLINE __m128i _mm_set1_epi16(short w)
4934 {
4935 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
4936 }
4937
4938 // Sets the 4 signed 32-bit integer values to i.
4939 //
4940 // r0 := i
4941 // r1 := i
4942 // r2 := i
4943 // r3 := i
4944 //
4945 // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
4946 FORCE_INLINE __m128i _mm_set1_epi32(int _i)
4947 {
4948 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
4949 }
4950
4951 // Sets the 2 signed 64-bit integer values to i.
4952 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
4953 FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
4954 {
4955 return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
4956 }
4957
4958 // Sets the 2 signed 64-bit integer values to i.
4959 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
4960 FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
4961 {
4962 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
4963 }
4964
4965 // Sets the 16 signed 8-bit integer values to b.
4966 //
4967 // r0 := b
4968 // r1 := b
4969 // ...
4970 // r15 := b
4971 //
4972 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
4973 FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
4974 {
4975 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
4976 }
4977
4978 // Broadcast double-precision (64-bit) floating-point value a to all elements of
4979 // dst.
4980 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
4981 FORCE_INLINE __m128d _mm_set1_pd(double d)
4982 {
4983 #if defined(__aarch64__)
4984 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
4985 #else
4986 return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
4987 #endif
4988 }
4989
4990 // Sets the 8 signed 16-bit integer values in reverse order.
4991 //
4992 // Return Value
4993 // r0 := w0
4994 // r1 := w1
4995 // ...
4996 // r7 := w7
4997 FORCE_INLINE __m128i _mm_setr_epi16(short w0,
4998 short w1,
4999 short w2,
5000 short w3,
5001 short w4,
5002 short w5,
5003 short w6,
5004 short w7)
5005 {
5006 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
5007 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
5008 }
5009
5010 // Sets the 4 signed 32-bit integer values in reverse order
5011 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
5012 FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
5013 {
5014 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
5015 return vreinterpretq_m128i_s32(vld1q_s32(data));
5016 }
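
// Usage note (illustrative only): _mm_set_epi32 takes its arguments from the
// highest element down, while _mm_setr_epi32 takes them in memory order, so
// the two calls below build the same vector.
//
//   __m128i x = _mm_set_epi32(3, 2, 1, 0);
//   __m128i y = _mm_setr_epi32(0, 1, 2, 3);
//   // In both x and y, lane 0 holds 0 and lane 3 holds 3.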
5017
5018 // Set packed 64-bit integers in dst with the supplied values in reverse order.
5019 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
5020 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
5021 {
5022 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
5023 }
5024
5025 // Sets the 16 signed 8-bit integer values in reverse order.
5026 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
5027 FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5028 signed char b1,
5029 signed char b2,
5030 signed char b3,
5031 signed char b4,
5032 signed char b5,
5033 signed char b6,
5034 signed char b7,
5035 signed char b8,
5036 signed char b9,
5037 signed char b10,
5038 signed char b11,
5039 signed char b12,
5040 signed char b13,
5041 signed char b14,
5042 signed char b15)
5043 {
5044 int8_t ALIGN_STRUCT(16)
5045 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5046 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5047 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5048 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5049 return (__m128i) vld1q_s8(data);
5050 }
5051
5052 // Set packed double-precision (64-bit) floating-point elements in dst with the
5053 // supplied values in reverse order.
5054 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
5055 FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5056 {
5057 return _mm_set_pd(e0, e1);
5058 }
5059
5060 // Return vector of type __m128d with all elements set to zero.
5061 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
5062 FORCE_INLINE __m128d _mm_setzero_pd(void)
5063 {
5064 #if defined(__aarch64__)
5065 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5066 #else
5067 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5068 #endif
5069 }
5070
5071 // Sets the 128-bit value to zero
5072 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
5073 FORCE_INLINE __m128i _mm_setzero_si128(void)
5074 {
5075 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5076 }
5077
5078 // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
5079 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
5080 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5081 // __constrange(0,255) int imm)
5082 #if __has_builtin(__builtin_shufflevector)
5083 #define _mm_shuffle_epi32(a, imm) \
5084 __extension__({ \
5085 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5086 int32x4_t _shuf = __builtin_shufflevector( \
5087 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5088 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5089 vreinterpretq_m128i_s32(_shuf); \
5090 })
5091 #else // generic
5092 #define _mm_shuffle_epi32(a, imm) \
5093 __extension__({ \
5094 __m128i ret; \
5095 switch (imm) { \
5096 case _MM_SHUFFLE(1, 0, 3, 2): \
5097 ret = _mm_shuffle_epi_1032((a)); \
5098 break; \
5099 case _MM_SHUFFLE(2, 3, 0, 1): \
5100 ret = _mm_shuffle_epi_2301((a)); \
5101 break; \
5102 case _MM_SHUFFLE(0, 3, 2, 1): \
5103 ret = _mm_shuffle_epi_0321((a)); \
5104 break; \
5105 case _MM_SHUFFLE(2, 1, 0, 3): \
5106 ret = _mm_shuffle_epi_2103((a)); \
5107 break; \
5108 case _MM_SHUFFLE(1, 0, 1, 0): \
5109 ret = _mm_shuffle_epi_1010((a)); \
5110 break; \
5111 case _MM_SHUFFLE(1, 0, 0, 1): \
5112 ret = _mm_shuffle_epi_1001((a)); \
5113 break; \
5114 case _MM_SHUFFLE(0, 1, 0, 1): \
5115 ret = _mm_shuffle_epi_0101((a)); \
5116 break; \
5117 case _MM_SHUFFLE(2, 2, 1, 1): \
5118 ret = _mm_shuffle_epi_2211((a)); \
5119 break; \
5120 case _MM_SHUFFLE(0, 1, 2, 2): \
5121 ret = _mm_shuffle_epi_0122((a)); \
5122 break; \
5123 case _MM_SHUFFLE(3, 3, 3, 2): \
5124 ret = _mm_shuffle_epi_3332((a)); \
5125 break; \
5126 case _MM_SHUFFLE(0, 0, 0, 0): \
5127 ret = _mm_shuffle_epi32_splat((a), 0); \
5128 break; \
5129 case _MM_SHUFFLE(1, 1, 1, 1): \
5130 ret = _mm_shuffle_epi32_splat((a), 1); \
5131 break; \
5132 case _MM_SHUFFLE(2, 2, 2, 2): \
5133 ret = _mm_shuffle_epi32_splat((a), 2); \
5134 break; \
5135 case _MM_SHUFFLE(3, 3, 3, 3): \
5136 ret = _mm_shuffle_epi32_splat((a), 3); \
5137 break; \
5138 default: \
5139 ret = _mm_shuffle_epi32_default((a), (imm)); \
5140 break; \
5141 } \
5142 ret; \
5143 })
5144 #endif
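
// Usage sketch (illustrative only): _MM_SHUFFLE lists the selected source
// lanes from the highest destination lane down to the lowest.
//
//   __m128i v = _mm_setr_epi32(10, 11, 12, 13);
//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
//   // r is v reversed: lanes 0..3 hold 13, 12, 11, 10.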
5145
5146 // Shuffle double-precision (64-bit) floating-point elements using the control
5147 // in imm8, and store the results in dst.
5148 //
5149 // dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
5150 // dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
5151 //
5152 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd
5153 #if __has_builtin(__builtin_shufflevector)
5154 #define _mm_shuffle_pd(a, b, imm8) \
5155 vreinterpretq_m128d_s64(__builtin_shufflevector( \
5156 vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \
5157 ((imm8 & 0x2) >> 1) + 2))
5158 #else
5159 #define _mm_shuffle_pd(a, b, imm8) \
5160 _mm_castsi128_pd(_mm_set_epi64x( \
5161 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5162 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5163 #endif
5164
5165 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5166 // __constrange(0,255) int imm)
5167 #if __has_builtin(__builtin_shufflevector)
5168 #define _mm_shufflehi_epi16(a, imm) \
5169 __extension__({ \
5170 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5171 int16x8_t _shuf = __builtin_shufflevector( \
5172 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5173 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5174 (((imm) >> 6) & 0x3) + 4); \
5175 vreinterpretq_m128i_s16(_shuf); \
5176 })
5177 #else // generic
5178 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5179 #endif
5180
5181 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5182 // __constrange(0,255) int imm)
5183 #if __has_builtin(__builtin_shufflevector)
5184 #define _mm_shufflelo_epi16(a, imm) \
5185 __extension__({ \
5186 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5187 int16x8_t _shuf = __builtin_shufflevector( \
5188 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5189 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5190 vreinterpretq_m128i_s16(_shuf); \
5191 })
5192 #else // generic
5193 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5194 #endif
5195
5196 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
5197 // shifting in zeros.
5198 //
5199 // r0 := a0 << count
5200 // r1 := a1 << count
5201 // ...
5202 // r7 := a7 << count
5203 //
5204 // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
5205 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5206 {
5207 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5208 if (unlikely(c > 15))
5209 return _mm_setzero_si128();
5210
5211 int16x8_t vc = vdupq_n_s16((int16_t) c);
5212 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5213 }
5214
5215 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
5216 // shifting in zeros.
5217 //
5218 // r0 := a0 << count
5219 // r1 := a1 << count
5220 // r2 := a2 << count
5221 // r3 := a3 << count
5222 //
5223 // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
5224 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5225 {
5226 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5227 if (unlikely(c > 31))
5228 return _mm_setzero_si128();
5229
5230 int32x4_t vc = vdupq_n_s32((int32_t) c);
5231 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5232 }
5233
5234 // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
5235 // shifting in zeros.
5236 //
5237 // r0 := a0 << count
5238 // r1 := a1 << count
5239 //
5240 // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
5241 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5242 {
5243 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5244 if (unlikely(c > 63))
5245 return _mm_setzero_si128();
5246
5247 int64x2_t vc = vdupq_n_s64((int64_t) c);
5248 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5249 }
5250
5251 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
5252 // shifting in zeros.
5253 //
5254 // r0 := a0 << count
5255 // r1 := a1 << count
5256 // ...
5257 // r7 := a7 << count
5258 //
5259 // https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
5260 #define _mm_slli_epi16(a, imm) \
5261 __extension__({ \
5262 __m128i ret; \
5263 if (unlikely((imm) <= 0)) { \
5264 ret = a; \
5265 } else if (unlikely((imm) > 15)) { \
5267 ret = _mm_setzero_si128(); \
5268 } else { \
5269 ret = vreinterpretq_m128i_s16( \
5270 vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
5271 } \
5272 ret; \
5273 })
5274
5275 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
5276 // shifting in zeros.
5277 // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
5278 // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
5279 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5280 {
5281 if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
5282 return a;
5283 if (unlikely(imm > 31))
5284 return _mm_setzero_si128();
5285 return vreinterpretq_m128i_s32(
5286 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5287 }
5288
5289 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5290 // store the results in dst.
5291 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5292 {
5293 if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
5294 return a;
5295 if (unlikely(imm > 63))
5296 return _mm_setzero_si128();
5297 return vreinterpretq_m128i_s64(
5298 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5299 }
5300
5301 // Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
5302 // must be an immediate.
5303 //
5304 // r := a << (imm * 8)
5305 //
5306 // https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
5307 // FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
5308 #define _mm_slli_si128(a, imm) \
5309 __extension__({ \
5310 __m128i ret; \
5311 if (unlikely((imm) <= 0)) { \
5312 ret = a; \
5313 } else if (unlikely((imm) > 15)) { \
5315 ret = _mm_setzero_si128(); \
5316 } else { \
5317 ret = vreinterpretq_m128i_s8(vextq_s8( \
5318 vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
5319 } \
5320 ret; \
5321 })
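
// Usage sketch (illustrative only): the shift amount counts whole bytes, so
// shifting by 4 moves every 32-bit lane up one position.
//
//   __m128i v = _mm_setr_epi32(1, 2, 3, 4);
//   __m128i r = _mm_slli_si128(v, 4);
//   // r holds {0, 1, 2, 3} as 32-bit lanes: lane 0 is zero-filled and the
//   // old lane 3 (value 4) is shifted out.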
5322
5323 // Compute the square root of packed double-precision (64-bit) floating-point
5324 // elements in a, and store the results in dst.
5325 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
5326 FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5327 {
5328 #if defined(__aarch64__)
5329 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5330 #else
5331 double a0 = sqrt(((double *) &a)[0]);
5332 double a1 = sqrt(((double *) &a)[1]);
5333 return _mm_set_pd(a1, a0);
5334 #endif
5335 }
5336
5337 // Compute the square root of the lower double-precision (64-bit) floating-point
5338 // element in b, store the result in the lower element of dst, and copy the
5339 // upper element from a to the upper element of dst.
5340 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
5341 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5342 {
5343 #if defined(__aarch64__)
5344 return _mm_move_sd(a, _mm_sqrt_pd(b));
5345 #else
5346 return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5347 #endif
5348 }
5349
5350 // Shifts the 8 signed 16-bit integers in a right by count bits while shifting
5351 // in the sign bit.
5352 //
5353 // r0 := a0 >> count
5354 // r1 := a1 >> count
5355 // ...
5356 // r7 := a7 >> count
5357 //
5358 // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
5359 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5360 {
5361 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5362 if (unlikely(c > 15))
5363 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5364 return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5365 }
5366
5367 // Shifts the 4 signed 32-bit integers in a right by count bits while shifting
5368 // in the sign bit.
5369 //
5370 // r0 := a0 >> count
5371 // r1 := a1 >> count
5372 // r2 := a2 >> count
5373 // r3 := a3 >> count
5374 //
5375 // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
5376 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5377 {
5378 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5379 if (unlikely(c > 31))
5380 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5381 return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5382 }
5383
5384 // Shift packed 16-bit integers in a right by imm while shifting in sign
5385 // bits, and store the results in dst.
5386 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
5387 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5388 {
5389 const int count = (imm & ~15) ? 15 : imm;
5390 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5391 }
5392
5393 // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
5394 // and store the results in dst.
5395 //
5396 // FOR j := 0 to 3
5397 // i := j*32
5398 // IF imm8[7:0] > 31
5399 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
5400 // ELSE
5401 // dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
5402 // FI
5403 // ENDFOR
5404 //
5405 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
5406 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
5407 #define _mm_srai_epi32(a, imm) \
5408 __extension__({ \
5409 __m128i ret; \
5410 if (unlikely((imm) == 0)) { \
5411 ret = a; \
5412 } else if (likely(0 < (imm) && (imm) < 32)) { \
5413 ret = vreinterpretq_m128i_s32( \
5414 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
5415 } else { \
5416 ret = vreinterpretq_m128i_s32( \
5417 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
5418 } \
5419 ret; \
5420 })
5421
5422 // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
5423 // while shifting in zeros.
5424 //
5425 // r0 := srl(a0, count)
5426 // r1 := srl(a1, count)
5427 // ...
5428 // r7 := srl(a7, count)
5429 //
5430 // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
5431 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5432 {
5433 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5434 if (unlikely(c > 15))
5435 return _mm_setzero_si128();
5436
5437 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5438 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5439 }
5440
5441 // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
5442 // while shifting in zeros.
5443 //
5444 // r0 := srl(a0, count)
5445 // r1 := srl(a1, count)
5446 // r2 := srl(a2, count)
5447 // r3 := srl(a3, count)
5448 //
5449 // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
5450 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5451 {
5452 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5453 if (unlikely(c > 31))
5454 return _mm_setzero_si128();
5455
5456 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5457 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5458 }
5459
5460 // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
5461 // while shifting in zeros.
5462 //
5463 // r0 := srl(a0, count)
5464 // r1 := srl(a1, count)
5465 //
5466 // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
5467 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5468 {
5469 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5470 if (unlikely(c > 63))
5471 return _mm_setzero_si128();
5472
5473 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5474 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5475 }
5476
5477 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5478 // store the results in dst.
5479 //
5480 // FOR j := 0 to 7
5481 // i := j*16
5482 // IF imm8[7:0] > 15
5483 // dst[i+15:i] := 0
5484 // ELSE
5485 // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
5486 // FI
5487 // ENDFOR
5488 //
5489 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
5490 #define _mm_srli_epi16(a, imm) \
5491 __extension__({ \
5492 __m128i ret; \
5493 if (unlikely((imm) == 0)) { \
5494 ret = a; \
5495 } else if (likely(0 < (imm) && (imm) < 16)) { \
5496 ret = vreinterpretq_m128i_u16( \
5497 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
5498 } else { \
5499 ret = _mm_setzero_si128(); \
5500 } \
5501 ret; \
5502 })
5503
5504 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5505 // store the results in dst.
5506 //
5507 // FOR j := 0 to 3
5508 // i := j*32
5509 // IF imm8[7:0] > 31
5510 // dst[i+31:i] := 0
5511 // ELSE
5512 // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
5513 // FI
5514 // ENDFOR
5515 //
5516 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
5517 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5518 #define _mm_srli_epi32(a, imm) \
5519 __extension__({ \
5520 __m128i ret; \
5521 if (unlikely((imm) == 0)) { \
5522 ret = a; \
5523 } else if (likely(0 < (imm) && (imm) < 32)) { \
5524 ret = vreinterpretq_m128i_u32( \
5525 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
5526 } else { \
5527 ret = _mm_setzero_si128(); \
5528 } \
5529 ret; \
5530 })
5531
5532 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5533 // store the results in dst.
5534 //
5535 // FOR j := 0 to 1
5536 // i := j*64
5537 // IF imm8[7:0] > 63
5538 // dst[i+63:i] := 0
5539 // ELSE
5540 // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
5541 // FI
5542 // ENDFOR
5543 //
5544 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
5545 #define _mm_srli_epi64(a, imm) \
5546 __extension__({ \
5547 __m128i ret; \
5548 if (unlikely((imm) == 0)) { \
5549 ret = a; \
5550 } else if (likely(0 < (imm) && (imm) < 64)) { \
5551 ret = vreinterpretq_m128i_u64( \
5552 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
5553 } else { \
5554 ret = _mm_setzero_si128(); \
5555 } \
5556 ret; \
5557 })
5558
5559 // Shifts the 128-bit value in a right by imm bytes while shifting in
5560 // zeros. imm must be an immediate.
5561 //
5562 // r := srl(a, imm*8)
5563 //
5564 // https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
5565 // FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
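// Implementation note: vextq_s8 extracts 16 bytes starting at byte 'imm' from
// the concatenation of its two operands, so pairing 'a' with a zero vector
// gives the byte-wise logical right shift (e.g. imm == 4 drops the low 4 bytes
// and shifts in 4 zero bytes at the top).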
5566 #define _mm_srli_si128(a, imm) \
5567 __extension__({ \
5568 __m128i ret; \
5569 if (unlikely((imm) <= 0)) { \
5570 ret = a; \
5571 } else \
5572 if (unlikely((imm) > 15)) { \
5573 ret = _mm_setzero_si128(); \
5574 } else { \
5575 ret = vreinterpretq_m128i_s8( \
5576 vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
5577 } \
5578 ret; \
5579 })
5580
5581 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5582 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5583 // or a general-protection exception may be generated.
5584 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
5585 FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5586 {
5587 #if defined(__aarch64__)
5588 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5589 #else
5590 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5591 #endif
5592 }
5593
5594 // Store the lower double-precision (64-bit) floating-point element from a into
5595 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5596 // boundary or a general-protection exception may be generated.
5597 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
5598 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5599 {
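// Duplicate the low 64-bit element into both lanes, then do a single 16-byte
// store.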
5600 #if defined(__aarch64__)
5601 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5602 vst1q_f64((float64_t *) mem_addr,
5603 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5604 #else
5605 float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5606 vst1q_f32((float32_t *) mem_addr,
5607 vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5608 #endif
5609 }
5610
5611 // Store the lower double-precision (64-bit) floating-point element from a into
5612 // memory. mem_addr does not need to be aligned on any particular boundary.
5613 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd
5614 FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5615 {
5616 #if defined(__aarch64__)
5617 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5618 #else
5619 vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5620 #endif
5621 }
5622
5623 // Stores four 32-bit integer values (as a __m128i value) at the address p.
5624 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
5625 FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5626 {
5627 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5628 }
5629
5630 // Store the lower double-precision (64-bit) floating-point element from a into
5631 // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5632 // boundary or a general-protection exception may be generated.
5633 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
5634 #define _mm_store1_pd _mm_store_pd1
5635
5636 // Store the upper double-precision (64-bit) floating-point element from a into
5637 // memory.
5638 //
5639 // MEM[mem_addr+63:mem_addr] := a[127:64]
5640 //
5641 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
5642 FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5643 {
5644 #if defined(__aarch64__)
5645 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5646 #else
5647 vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5648 #endif
5649 }
5650
5651 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
5652 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
5653 FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5654 {
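// Only the low 64 bits of *a are replaced (with the low 64 bits of b); the
// upper 64 bits already present in *a are preserved.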
5655 uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
5656 uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
5657 *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
5658 }
5659
5660 // Store the lower double-precision (64-bit) floating-point element from a into
5661 // memory.
5662 //
5663 // MEM[mem_addr+63:mem_addr] := a[63:0]
5664 //
5665 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
5666 FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5667 {
5668 #if defined(__aarch64__)
5669 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5670 #else
5671 vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5672 #endif
5673 }
5674
5675 // Store 2 double-precision (64-bit) floating-point elements from a into memory
5676 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5677 // general-protection exception may be generated.
5678 //
5679 // MEM[mem_addr+63:mem_addr] := a[127:64]
5680 // MEM[mem_addr+127:mem_addr+64] := a[63:0]
5681 //
5682 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
5683 FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5684 {
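// Rotating the four 32-bit lanes by two positions swaps the two 64-bit
// elements, after which a normal aligned store writes them in reverse order.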
5685 float32x4_t f = vreinterpretq_f32_m128d(a);
5686 _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5687 }
5688
5689 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5690 // elements) from a into memory. mem_addr does not need to be aligned on any
5691 // particular boundary.
5692 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
5693 FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5694 {
5695 _mm_store_pd(mem_addr, a);
5696 }
5697
5698 // Stores 128-bits of integer data a at the address p.
5699 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
5700 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5701 {
5702 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5703 }
5704
5705 // Stores 32-bits of integer data a at the address p.
5706 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
5707 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5708 {
5709 vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5710 }
5711
5712 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5713 // elements) from a into memory using a non-temporal memory hint. mem_addr must
5714 // be aligned on a 16-byte boundary or a general-protection exception may be
5715 // generated.
5716 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd
5717 FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5718 {
5719 #if __has_builtin(__builtin_nontemporal_store)
5720 __builtin_nontemporal_store(a, (__m128d *) p);
5721 #elif defined(__aarch64__)
5722 vst1q_f64(p, vreinterpretq_f64_m128d(a));
5723 #else
5724 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5725 #endif
5726 }
5727
5728 // Stores the data in a to the address p without polluting the caches. If the
5729 // cache line containing address p is already in the cache, the cache will be
5730 // updated.
5731 // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
5732 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5733 {
5734 #if __has_builtin(__builtin_nontemporal_store)
5735 __builtin_nontemporal_store(a, p);
5736 #else
5737 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5738 #endif
5739 }
5740
5741 // Store 32-bit integer a into memory using a non-temporal hint to minimize
5742 // cache pollution. If the cache line containing address mem_addr is already in
5743 // the cache, the cache will be updated.
5744 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32
5745 FORCE_INLINE void _mm_stream_si32(int *p, int a)
5746 {
5747 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5748 }
5749
5750 // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
5751 // store the results in dst.
5752 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16
5753 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
5754 {
5755 return vreinterpretq_m128i_s16(
5756 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5757 }
5758
5759 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
5760 // unsigned 32-bit integers of a.
5761 //
5762 // r0 := a0 - b0
5763 // r1 := a1 - b1
5764 // r2 := a2 - b2
5765 // r3 := a3 - b3
5766 //
5767 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
5768 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
5769 {
5770 return vreinterpretq_m128i_s32(
5771 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5772 }
5773
5774 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
5775 // and store the results in dst.
5776 // r0 := a0 - b0
5777 // r1 := a1 - b1
5778 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
5779 {
5780 return vreinterpretq_m128i_s64(
5781 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5782 }
5783
5784 // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
5785 // store the results in dst.
5786 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8
5787 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
5788 {
5789 return vreinterpretq_m128i_s8(
5790 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5791 }
5792
5793 // Subtract packed double-precision (64-bit) floating-point elements in b from
5794 // packed double-precision (64-bit) floating-point elements in a, and store the
5795 // results in dst.
5796 //
5797 // FOR j := 0 to 1
5798 // i := j*64
5799 // dst[i+63:i] := a[i+63:i] - b[i+63:i]
5800 // ENDFOR
5801 //
5802 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
5803 FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
5804 {
5805 #if defined(__aarch64__)
5806 return vreinterpretq_m128d_f64(
5807 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5808 #else
5809 double *da = (double *) &a;
5810 double *db = (double *) &b;
5811 double c[2];
5812 c[0] = da[0] - db[0];
5813 c[1] = da[1] - db[1];
5814 return vld1q_f32((float32_t *) c);
5815 #endif
5816 }
5817
5818 // Subtract the lower double-precision (64-bit) floating-point element in b from
5819 // the lower double-precision (64-bit) floating-point element in a, store the
5820 // result in the lower element of dst, and copy the upper element from a to the
5821 // upper element of dst.
5822 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
5823 FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
5824 {
5825 return _mm_move_sd(a, _mm_sub_pd(a, b));
5826 }
5827
5828 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
5829 //
5830 // dst[63:0] := a[63:0] - b[63:0]
5831 //
5832 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
5833 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
5834 {
5835 return vreinterpret_m64_s64(
5836 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
5837 }
5838
5839 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
5840 // of a and saturates.
5841 //
5842 // r0 := SignedSaturate(a0 - b0)
5843 // r1 := SignedSaturate(a1 - b1)
5844 // ...
5845 // r7 := SignedSaturate(a7 - b7)
5846 //
5847 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
5848 FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
5849 {
5850 return vreinterpretq_m128i_s16(
5851 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5852 }
5853
5854 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
5855 // of a and saturates.
5856 //
5857 // r0 := SignedSaturate(a0 - b0)
5858 // r1 := SignedSaturate(a1 - b1)
5859 // ...
5860 // r15 := SignedSaturate(a15 - b15)
5861 //
5862 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
5863 FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
5864 {
5865 return vreinterpretq_m128i_s8(
5866 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5867 }
5868
5869 // Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
5870 // integers of a and saturates.
5871 // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
5872 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
5873 {
5874 return vreinterpretq_m128i_u16(
5875 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
5876 }
5877
5878 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
5879 // integers of a and saturates.
5880 //
5881 // r0 := UnsignedSaturate(a0 - b0)
5882 // r1 := UnsignedSaturate(a1 - b1)
5883 // ...
5884 // r15 := UnsignedSaturate(a15 - b15)
5885 //
5886 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
5887 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
5888 {
5889 return vreinterpretq_m128i_u8(
5890 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
5891 }
5892
5893 #define _mm_ucomieq_sd _mm_comieq_sd
5894 #define _mm_ucomige_sd _mm_comige_sd
5895 #define _mm_ucomigt_sd _mm_comigt_sd
5896 #define _mm_ucomile_sd _mm_comile_sd
5897 #define _mm_ucomilt_sd _mm_comilt_sd
5898 #define _mm_ucomineq_sd _mm_comineq_sd
5899
5900 // Return vector of type __m128d with undefined elements.
5901 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd
5902 FORCE_INLINE __m128d _mm_undefined_pd(void)
5903 {
5904 #if defined(__GNUC__) || defined(__clang__)
5905 #pragma GCC diagnostic push
5906 #pragma GCC diagnostic ignored "-Wuninitialized"
5907 #endif
5908 __m128d a;
5909 return a;
5910 #if defined(__GNUC__) || defined(__clang__)
5911 #pragma GCC diagnostic pop
5912 #endif
5913 }
5914
5915 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5916 // upper 4 signed or unsigned 16-bit integers in b.
5917 //
5918 // r0 := a4
5919 // r1 := b4
5920 // r2 := a5
5921 // r3 := b5
5922 // r4 := a6
5923 // r5 := b6
5924 // r6 := a7
5925 // r7 := b7
5926 //
5927 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
5928 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5929 {
5930 #if defined(__aarch64__)
5931 return vreinterpretq_m128i_s16(
5932 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5933 #else
5934 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5935 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5936 int16x4x2_t result = vzip_s16(a1, b1);
5937 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5938 #endif
5939 }
5940
5941 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
5942 // upper 2 signed or unsigned 32-bit integers in b.
5943 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
5944 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5945 {
5946 #if defined(__aarch64__)
5947 return vreinterpretq_m128i_s32(
5948 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5949 #else
5950 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5951 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5952 int32x2x2_t result = vzip_s32(a1, b1);
5953 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5954 #endif
5955 }
5956
5957 // Interleaves the upper signed or unsigned 64-bit integer in a with the
5958 // upper signed or unsigned 64-bit integer in b.
5959 //
5960 // r0 := a1
5961 // r1 := b1
5962 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5963 {
5964 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5965 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5966 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5967 }
5968
5969 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
5970 // 8 signed or unsigned 8-bit integers in b.
5971 //
5972 // r0 := a8
5973 // r1 := b8
5974 // r2 := a9
5975 // r3 := b9
5976 // ...
5977 // r14 := a15
5978 // r15 := b15
5979 //
5980 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
5981 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5982 {
5983 #if defined(__aarch64__)
5984 return vreinterpretq_m128i_s8(
5985 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5986 #else
5987 int8x8_t a1 =
5988 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5989 int8x8_t b1 =
5990 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5991 int8x8x2_t result = vzip_s8(a1, b1);
5992 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5993 #endif
5994 }
5995
5996 // Unpack and interleave double-precision (64-bit) floating-point elements from
5997 // the high half of a and b, and store the results in dst.
5998 //
5999 // DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) {
6000 // dst[63:0] := src1[127:64]
6001 // dst[127:64] := src2[127:64]
6002 // RETURN dst[127:0]
6003 // }
6004 // dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
6005 //
6006 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd
6007 FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
6008 {
6009 #if defined(__aarch64__)
6010 return vreinterpretq_m128d_f64(
6011 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6012 #else
6013 return vreinterpretq_m128d_s64(
6014 vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
6015 vget_high_s64(vreinterpretq_s64_m128d(b))));
6016 #endif
6017 }
6018
6019 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
6020 // lower 4 signed or unsigned 16-bit integers in b.
6021 //
6022 // r0 := a0
6023 // r1 := b0
6024 // r2 := a1
6025 // r3 := b1
6026 // r4 := a2
6027 // r5 := b2
6028 // r6 := a3
6029 // r7 := b3
6030 //
6031 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
6032 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
6033 {
6034 #if defined(__aarch64__)
6035 return vreinterpretq_m128i_s16(
6036 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
6037 #else
6038 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
6039 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
6040 int16x4x2_t result = vzip_s16(a1, b1);
6041 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
6042 #endif
6043 }
6044
6045 // Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
6046 // lower 2 signed or unsigned 32-bit integers in b.
6047 //
6048 // r0 := a0
6049 // r1 := b0
6050 // r2 := a1
6051 // r3 := b1
6052 //
6053 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
6054 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
6055 {
6056 #if defined(__aarch64__)
6057 return vreinterpretq_m128i_s32(
6058 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6059 #else
6060 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
6061 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
6062 int32x2x2_t result = vzip_s32(a1, b1);
6063 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
6064 #endif
6065 }
6066
6067 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
6068 {
6069 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
6070 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
6071 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
6072 }
6073
6074 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
6075 // 8 signed or unsigned 8-bit integers in b.
6076 //
6077 // r0 := a0
6078 // r1 := b0
6079 // r2 := a1
6080 // r3 := b1
6081 // ...
6082 // r14 := a7
6083 // r15 := b7
6084 //
6085 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
6086 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
6087 {
6088 #if defined(__aarch64__)
6089 return vreinterpretq_m128i_s8(
6090 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
6091 #else
6092 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
6093 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
6094 int8x8x2_t result = vzip_s8(a1, b1);
6095 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
6096 #endif
6097 }
6098
6099 // Unpack and interleave double-precision (64-bit) floating-point elements from
6100 // the low half of a and b, and store the results in dst.
6101 //
6102 // DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) {
6103 // dst[63:0] := src1[63:0]
6104 // dst[127:64] := src2[63:0]
6105 // RETURN dst[127:0]
6106 // }
6107 // dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
6108 //
6109 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd
6110 FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
6111 {
6112 #if defined(__aarch64__)
6113 return vreinterpretq_m128d_f64(
6114 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6115 #else
6116 return vreinterpretq_m128d_s64(
6117 vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
6118 vget_low_s64(vreinterpretq_s64_m128d(b))));
6119 #endif
6120 }
6121
6122 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
6123 // elements in a and b, and store the results in dst.
6124 //
6125 // FOR j := 0 to 1
6126 // i := j*64
6127 // dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
6128 // ENDFOR
6129 //
6130 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
6131 FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
6132 {
6133 return vreinterpretq_m128d_s64(
6134 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
6135 }
6136
6137 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
6138 // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
6139 FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
6140 {
6141 return vreinterpretq_m128i_s32(
6142 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
6143 }
6144
6145 /* SSE3 */
6146
6147 // Alternatively add and subtract packed double-precision (64-bit)
6148 // floating-point elements in a to/from packed elements in b, and store the
6149 // results in dst.
6150 //
6151 // FOR j := 0 to 1
6152 // i := j*64
6153 // IF ((j & 1) == 0)
6154 // dst[i+63:i] := a[i+63:i] - b[i+63:i]
6155 // ELSE
6156 // dst[i+63:i] := a[i+63:i] + b[i+63:i]
6157 // FI
6158 // ENDFOR
6159 //
6160 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd
6161 FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
6162 {
6163 __m128d mask = _mm_set_pd(1.0f, -1.0f);
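// _mm_set_pd(hi, lo): lane 0 holds -1.0 and lane 1 holds 1.0, so adding
// b * mask to a subtracts b in the even lane and adds b in the odd lane.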
6164 #if defined(__aarch64__)
6165 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
6166 vreinterpretq_f64_m128d(b),
6167 vreinterpretq_f64_m128d(mask)));
6168 #else
6169 return _mm_add_pd(_mm_mul_pd(b, mask), a);
6170 #endif
6171 }
6172
6173 // Alternatively add and subtract packed single-precision (32-bit)
6174 // floating-point elements in a to/from packed elements in b, and store the
6175 // results in dst.
6176 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
6177 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
6178 {
6179 __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
6180 #if defined(__aarch64__) || defined(__ARM_FEATURE_FMA) /* VFPv4+ */
6181 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
6182 vreinterpretq_f32_m128(mask),
6183 vreinterpretq_f32_m128(b)));
6184 #else
6185 return _mm_add_ps(_mm_mul_ps(b, mask), a);
6186 #endif
6187 }
6188
6189 // Horizontally add adjacent pairs of double-precision (64-bit) floating-point
6190 // elements in a and b, and pack the results in dst.
6191 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd
6192 FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
6193 {
6194 #if defined(__aarch64__)
6195 return vreinterpretq_m128d_f64(
6196 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
6197 #else
6198 double *da = (double *) &a;
6199 double *db = (double *) &b;
6200 double c[] = {da[0] + da[1], db[0] + db[1]};
6201 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6202 #endif
6203 }
6204
6205 // Computes the pairwise add of adjacent single-precision floating-point
6206 // values in a and b.
6207 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
6208 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
6209 {
6210 #if defined(__aarch64__)
6211 return vreinterpretq_m128_f32(
6212 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
6213 #else
6214 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
6215 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
6216 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
6217 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
6218 return vreinterpretq_m128_f32(
6219 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
6220 #endif
6221 }
6222
6223 // Horizontally subtract adjacent pairs of double-precision (64-bit)
6224 // floating-point elements in a and b, and pack the results in dst.
6225 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
6226 FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
6227 {
6228 #if defined(__aarch64__)
6229 return vreinterpretq_m128d_f64(vsubq_f64(
6230 vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)),
6231 vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b))));
6232 #else
6233 double *da = (double *) &_a;
6234 double *db = (double *) &_b;
6235 double c[] = {da[0] - da[1], db[0] - db[1]};
6236 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
6237 #endif
6238 }
6239
6240 // Horizontally subtract adjacent pairs of single-precision (32-bit)
6241 // floating-point elements in a and b, and pack the results in dst.
6242 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
6243 FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
6244 {
6245 #if defined(__aarch64__)
6246 return vreinterpretq_m128_f32(vsubq_f32(
6247 vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
6248 vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
6249 #else
6250 float32x4x2_t c =
6251 vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
6252 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
6253 #endif
6254 }
6255
6256 // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
6257 // may perform better than _mm_loadu_si128 when the data crosses a cache line
6258 // boundary.
6259 //
6260 // dst[127:0] := MEM[mem_addr+127:mem_addr]
6261 //
6262 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
6263 #define _mm_lddqu_si128 _mm_loadu_si128
6264
6265 // Load a double-precision (64-bit) floating-point element from memory into both
6266 // elements of dst.
6267 //
6268 // dst[63:0] := MEM[mem_addr+63:mem_addr]
6269 // dst[127:64] := MEM[mem_addr+63:mem_addr]
6270 //
6271 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
6272 #define _mm_loaddup_pd _mm_load1_pd
6273
6274 // Duplicate the low double-precision (64-bit) floating-point element from a,
6275 // and store the results in dst.
6276 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd
6277 FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6278 {
6279 #if defined(__aarch64__)
6280 return vreinterpretq_m128d_f64(
6281 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6282 #else
6283 return vreinterpretq_m128d_u64(
6284 vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6285 #endif
6286 }
6287
6288 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
6289 // from a, and store the results in dst.
6290 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
6291 FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6292 {
6293 #if __has_builtin(__builtin_shufflevector)
6294 return vreinterpretq_m128_f32(__builtin_shufflevector(
6295 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6296 #else
6297 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6298 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6299 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6300 return vreinterpretq_m128_f32(vld1q_f32(data));
6301 #endif
6302 }
6303
6304 // Duplicate even-indexed single-precision (32-bit) floating-point elements
6305 // from a, and store the results in dst.
6306 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
6307 FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6308 {
6309 #if __has_builtin(__builtin_shufflevector)
6310 return vreinterpretq_m128_f32(__builtin_shufflevector(
6311 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6312 #else
6313 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6314 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6315 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6316 return vreinterpretq_m128_f32(vld1q_f32(data));
6317 #endif
6318 }
6319
6320 /* SSSE3 */
6321
6322 // Compute the absolute value of packed signed 16-bit integers in a, and store
6323 // the unsigned results in dst.
6324 //
6325 // FOR j := 0 to 7
6326 // i := j*16
6327 // dst[i+15:i] := ABS(a[i+15:i])
6328 // ENDFOR
6329 //
6330 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
6331 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6332 {
6333 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6334 }
6335
6336 // Compute the absolute value of packed signed 32-bit integers in a, and store
6337 // the unsigned results in dst.
6338 //
6339 // FOR j := 0 to 3
6340 // i := j*32
6341 // dst[i+31:i] := ABS(a[i+31:i])
6342 // ENDFOR
6343 //
6344 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
6345 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6346 {
6347 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6348 }
6349
6350 // Compute the absolute value of packed signed 8-bit integers in a, and store
6351 // the unsigned results in dst.
6352 //
6353 // FOR j := 0 to 15
6354 // i := j*8
6355 // dst[i+7:i] := ABS(a[i+7:i])
6356 // ENDFOR
6357 //
6358 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
6359 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6360 {
6361 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6362 }
6363
6364 // Compute the absolute value of packed signed 16-bit integers in a, and store
6365 // the unsigned results in dst.
6366 //
6367 // FOR j := 0 to 3
6368 // i := j*16
6369 // dst[i+15:i] := ABS(a[i+15:i])
6370 // ENDFOR
6371 //
6372 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
6373 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6374 {
6375 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6376 }
6377
6378 // Compute the absolute value of packed signed 32-bit integers in a, and store
6379 // the unsigned results in dst.
6380 //
6381 // FOR j := 0 to 1
6382 // i := j*32
6383 // dst[i+31:i] := ABS(a[i+31:i])
6384 // ENDFOR
6385 //
6386 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
6387 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6388 {
6389 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6390 }
6391
6392 // Compute the absolute value of packed signed 8-bit integers in a, and store
6393 // the unsigned results in dst.
6394 //
6395 // FOR j := 0 to 7
6396 // i := j*8
6397 // dst[i+7:i] := ABS(a[i+7:i])
6398 // ENDFOR
6399 //
6400 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
6401 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6402 {
6403 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6404 }
6405
6406 // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6407 // the result right by imm8 bytes, and store the low 16 bytes in dst.
6408 //
6409 // tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8)
6410 // dst[127:0] := tmp[127:0]
6411 //
6412 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8
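// Implementation note: for imm < 16, vextq_u8 selects 16 bytes from the
// concatenation a:b starting at byte 'imm' of b, e.g. _mm_alignr_epi8(a, b, 4)
// yields b[4..15] followed by a[0..3]. For 16 <= imm < 32 only bytes of a
// remain, shifted right by (imm - 16) with zeros shifted in at the top.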
6413 #define _mm_alignr_epi8(a, b, imm) \
6414 __extension__({ \
6415 __m128i ret; \
6416 if (unlikely((imm) >= 32)) { \
6417 ret = _mm_setzero_si128(); \
6418 } else { \
6419 uint8x16_t tmp_low, tmp_high; \
6420 if (imm >= 16) { \
6421 const int idx = imm - 16; \
6422 tmp_low = vreinterpretq_u8_m128i(a); \
6423 tmp_high = vdupq_n_u8(0); \
6424 ret = \
6425 vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
6426 } else { \
6427 const int idx = imm; \
6428 tmp_low = vreinterpretq_u8_m128i(b); \
6429 tmp_high = vreinterpretq_u8_m128i(a); \
6430 ret = \
6431 vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \
6432 } \
6433 } \
6434 ret; \
6435 })
6436
6437 // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6438 // the result right by imm8 bytes, and store the low 8 bytes in dst.
6439 //
6440 // tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8)
6441 // dst[63:0] := tmp[63:0]
6442 //
6443 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8
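// Implementation note: same scheme as _mm_alignr_epi8 above, but on 8-byte
// vectors: vext_u8 picks 8 bytes from the concatenation a:b (or a:zero when
// imm >= 8) starting at byte 'imm'.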
6444 #define _mm_alignr_pi8(a, b, imm) \
6445 __extension__({ \
6446 __m64 ret; \
6447 if (unlikely((imm) >= 16)) { \
6448 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6449 } else { \
6450 uint8x8_t tmp_low, tmp_high; \
6451 if (imm >= 8) { \
6452 const int idx = imm - 8; \
6453 tmp_low = vreinterpret_u8_m64(a); \
6454 tmp_high = vdup_n_u8(0); \
6455 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6456 } else { \
6457 const int idx = imm; \
6458 tmp_low = vreinterpret_u8_m64(b); \
6459 tmp_high = vreinterpret_u8_m64(a); \
6460 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6461 } \
6462 } \
6463 ret; \
6464 })
6465
6466 // Computes the pairwise add of adjacent 16-bit signed or unsigned integer
6467 // values in a and b.
6468 FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6469 {
6470 int16x8_t a = vreinterpretq_s16_m128i(_a);
6471 int16x8_t b = vreinterpretq_s16_m128i(_b);
6472 #if defined(__aarch64__)
6473 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6474 #else
6475 return vreinterpretq_m128i_s16(
6476 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6477 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6478 #endif
6479 }
6480
6481 // Computes the pairwise add of adjacent 32-bit signed or unsigned integer
6482 // values in a and b.
6483 FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6484 {
6485 int32x4_t a = vreinterpretq_s32_m128i(_a);
6486 int32x4_t b = vreinterpretq_s32_m128i(_b);
6487 return vreinterpretq_m128i_s32(
6488 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6489 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6490 }
6491
6492 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6493 // signed 16-bit results in dst.
6494 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
6495 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6496 {
6497 return vreinterpret_m64_s16(
6498 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6499 }
6500
6501 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6502 // signed 32-bit results in dst.
6503 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
6504 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6505 {
6506 return vreinterpret_m64_s32(
6507 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6508 }
6509
6510 // Computes the saturated pairwise add of adjacent 16-bit signed
6511 // integer values in a and b.
6512 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6513 {
6514 #if defined(__aarch64__)
6515 int16x8_t a = vreinterpretq_s16_m128i(_a);
6516 int16x8_t b = vreinterpretq_s16_m128i(_b);
6517 return vreinterpretq_s64_s16(
6518 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6519 #else
6520 int32x4_t a = vreinterpretq_s32_m128i(_a);
6521 int32x4_t b = vreinterpretq_s32_m128i(_b);
6522 // Interleave using vshrn/vmovn
6523 // [a0|a2|a4|a6|b0|b2|b4|b6]
6524 // [a1|a3|a5|a7|b1|b3|b5|b7]
6525 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6526 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6527 // Saturated add
6528 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6529 #endif
6530 }
6531
6532 // Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6533 // saturation, and pack the signed 16-bit results in dst.
6534 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_pi16
6535 FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6536 {
6537 int16x4_t a = vreinterpret_s16_m64(_a);
6538 int16x4_t b = vreinterpret_s16_m64(_b);
6539 #if defined(__aarch64__)
6540 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6541 #else
6542 int16x4x2_t res = vuzp_s16(a, b);
6543 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6544 #endif
6545 }
6546
6547 // Computes the pairwise difference of adjacent 16-bit signed or unsigned
6548 // integer values in a and b.
6549 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6550 {
6551 int32x4_t a = vreinterpretq_s32_m128i(_a);
6552 int32x4_t b = vreinterpretq_s32_m128i(_b);
6553 // Interleave using vshrn/vmovn
6554 // [a0|a2|a4|a6|b0|b2|b4|b6]
6555 // [a1|a3|a5|a7|b1|b3|b5|b7]
6556 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6557 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6558 // Subtract
6559 return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
6560 }
6561
6562 // Computes the pairwise difference of adjacent 32-bit signed or unsigned
6563 // integer values in a and b.
6564 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6565 {
6566 int64x2_t a = vreinterpretq_s64_m128i(_a);
6567 int64x2_t b = vreinterpretq_s64_m128i(_b);
6568 // Interleave using vshrn/vmovn
6569 // [a0|a2|b0|b2]
6570 // [a1|a3|b1|b3]
6571 int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
6572 int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
6573 // Subtract
6574 return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
6575 }
6576
6577 // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6578 // the signed 16-bit results in dst.
6579 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pi16
6580 FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6581 {
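// Pack both 64-bit operands into one 128-bit vector; the narrowing move and
// narrowing shift below then extract the even-indexed and odd-indexed 16-bit
// elements of each pair, whose difference is the horizontal subtraction.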
6582 int32x4_t ab =
6583 vcombine_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
6584
6585 int16x4_t ab_low_bits = vmovn_s32(ab);
6586 int16x4_t ab_high_bits = vshrn_n_s32(ab, 16);
6587
6588 return vreinterpret_m64_s16(vsub_s16(ab_low_bits, ab_high_bits));
6589 }
6590
6591 // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6592 // the signed 32-bit results in dst.
6593 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_hsub_pi32
6594 FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6595 {
6596 #if defined(__aarch64__)
6597 int32x2_t a = vreinterpret_s32_m64(_a);
6598 int32x2_t b = vreinterpret_s32_m64(_b);
6599 return vreinterpret_m64_s32(vsub_s32(vtrn1_s32(a, b), vtrn2_s32(a, b)));
6600 #else
6601 int32x2x2_t trn_ab =
6602 vtrn_s32(vreinterpret_s32_m64(_a), vreinterpret_s32_m64(_b));
6603 return vreinterpret_m64_s32(vsub_s32(trn_ab.val[0], trn_ab.val[1]));
6604 #endif
6605 }
6606
6607 // Computes the saturated pairwise difference of adjacent 16-bit signed
6608 // integer values in a and b.
6609 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
6610 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6611 {
6612 #if defined(__aarch64__)
6613 int16x8_t a = vreinterpretq_s16_m128i(_a);
6614 int16x8_t b = vreinterpretq_s16_m128i(_b);
6615 return vreinterpretq_s64_s16(
6616 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6617 #else
6618 int32x4_t a = vreinterpretq_s32_m128i(_a);
6619 int32x4_t b = vreinterpretq_s32_m128i(_b);
6620 // Interleave using vshrn/vmovn
6621 // [a0|a2|a4|a6|b0|b2|b4|b6]
6622 // [a1|a3|a5|a7|b1|b3|b5|b7]
6623 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6624 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6625 // Saturated subtract
6626 return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
6627 #endif
6628 }
6629
6630 // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6631 // using saturation, and pack the signed 16-bit results in dst.
6632 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_pi16
6633 FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6634 {
6635 int16x4_t a = vreinterpret_s16_m64(_a);
6636 int16x4_t b = vreinterpret_s16_m64(_b);
6637 #if defined(__aarch64__)
6638 return vreinterpret_s64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6639 #else
6640 int16x4x2_t res = vuzp_s16(a, b);
6641 return vreinterpret_s64_s16(vqsub_s16(res.val[0], res.val[1]));
6642 #endif
6643 }
6644
6645 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6646 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6647 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
6648 // and pack the saturated results in dst.
6649 //
6650 // FOR j := 0 to 7
6651 // i := j*16
6652 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
6653 // a[i+7:i]*b[i+7:i] )
6654 // ENDFOR
6655 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6656 {
6657 #if defined(__aarch64__)
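// Widen the unsigned bytes of a and the signed bytes of b to 16 bits, multiply
// the low and high halves, then de-interleave the even/odd products with
// vuzp1q/vuzp2q and add adjacent pairs with saturation.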
6658 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6659 int8x16_t b = vreinterpretq_s8_m128i(_b);
6660 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6661 vmovl_s8(vget_low_s8(b)));
6662 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6663 vmovl_s8(vget_high_s8(b)));
6664 return vreinterpretq_m128i_s16(
6665 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6666 #else
6667 // This would be much simpler if x86 would choose to zero extend OR sign
6668 // extend, not both. This could probably be optimized better.
6669 uint16x8_t a = vreinterpretq_u16_m128i(_a);
6670 int16x8_t b = vreinterpretq_s16_m128i(_b);
6671
6672 // Zero extend a
6673 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6674 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6675
6676 // Sign extend by shifting left then shifting right.
6677 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6678 int16x8_t b_odd = vshrq_n_s16(b, 8);
6679
6680 // multiply
6681 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6682 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6683
6684 // saturated add
6685 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
6686 #endif
6687 }
6688
6689 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
6690 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6691 // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6692 // pack the saturated results in dst.
6693 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_pi16
6694 FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6695 {
6696 uint16x4_t a = vreinterpret_u16_m64(_a);
6697 int16x4_t b = vreinterpret_s16_m64(_b);
6698
6699 // Zero extend a
6700 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6701 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6702
6703 // Sign extend by shifting left then shifting right.
6704 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6705 int16x4_t b_odd = vshr_n_s16(b, 8);
6706
6707 // multiply
6708 int16x4_t prod1 = vmul_s16(a_even, b_even);
6709 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6710
6711 // saturated add
6712 return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6713 }
6714
6715 // Multiply packed signed 16-bit integers in a and b, producing intermediate
6716 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
6717 // the packed 16-bit integers in dst.
6718 //
6719 // r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
6720 // r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
6721 // r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
6722 // ...
6723 // r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
6724 FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6725 {
6726 // Has issues due to saturation
6727 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
6728
6729 // Multiply
6730 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
6731 vget_low_s16(vreinterpretq_s16_m128i(b)));
6732 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
6733 vget_high_s16(vreinterpretq_s16_m128i(b)));
6734
6735 // Rounding narrowing shift right
6736 // narrow = (int16_t)((mul + 16384) >> 15);
6737 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6738 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6739
6740 // Join together
6741 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
6742 }
6743
6744 // Multiply packed signed 16-bit integers in a and b, producing intermediate
6745 // signed 32-bit integers. Truncate each intermediate integer to the 18 most
6746 // significant bits, round by adding 1, and store bits [16:1] to dst.
6747 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_pi16
6748 FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6749 {
6750 int32x4_t mul_extend =
6751 vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6752
6753 // Rounding narrowing shift right
6754 return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
6755 }
6756
6757 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
6758 // corresponding 8-bit element of b, and store the results in dst.
6759 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
6760 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
6761 {
6762 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
6763 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
6764 uint8x16_t idx_masked =
6765 vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
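// Masking with 0x8F keeps the low four index bits plus bit 7: when bit 7 of a
// control byte is set, the masked index is >= 16 and falls outside the table,
// so the table-lookup instructions return 0, matching the SSSE3 zeroing rule.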
6766 #if defined(__aarch64__)
6767 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
6768 #elif defined(__GNUC__)
6769 int8x16_t ret;
6770 // %e and %f represent the even and odd D registers
6771 // respectively.
6772 __asm__ __volatile__(
6773 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
6774 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
6775 : [ret] "=&w"(ret)
6776 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
6777 return vreinterpretq_m128i_s8(ret);
6778 #else
6779 // Generic fallback: emulate the 16-byte table lookup with a two-register vtbl2
6780 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
6781 return vreinterpretq_m128i_s8(
6782 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
6783 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
6784 #endif
6785 }
6786
6787 // Negate packed 16-bit integers in a when the corresponding signed
6788 // 16-bit integer in b is negative, and store the results in dst.
6789 // Elements in dst are zeroed out when the corresponding element
6790 // in b is zero.
6791 //
6792 // for i in 0..7
6793 // if b[i] < 0
6794 // r[i] := -a[i]
6795 // else if b[i] == 0
6796 // r[i] := 0
6797 // else
6798 // r[i] := a[i]
6799 // fi
6800 // done
6801 FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6802 {
6803 int16x8_t a = vreinterpretq_s16_m128i(_a);
6804 int16x8_t b = vreinterpretq_s16_m128i(_b);
6805
6806 // signed shift right: faster than vclt
6807 // (b < 0) ? 0xFFFF : 0
6808 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6809 // (b == 0) ? 0xFFFF : 0
6810 #if defined(__aarch64__)
6811 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6812 #else
6813 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6814 #endif
6815
6816 // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
6817 // 'a') based on ltMask
6818 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6819 // res = masked & (~zeroMask)
6820 int16x8_t res = vbicq_s16(masked, zeroMask);
6821 return vreinterpretq_m128i_s16(res);
6822 }
6823
6824 // Negate packed 32-bit integers in a when the corresponding signed
6825 // 32-bit integer in b is negative, and store the results in dst.
6826 // Elements in dst are zeroed out when the corresponding element
6827 // in b is zero.
6828 //
6829 // for i in 0..3
6830 // if b[i] < 0
6831 // r[i] := -a[i]
6832 // else if b[i] == 0
6833 // r[i] := 0
6834 // else
6835 // r[i] := a[i]
6836 // fi
6837 // done
6838 FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6839 {
6840 int32x4_t a = vreinterpretq_s32_m128i(_a);
6841 int32x4_t b = vreinterpretq_s32_m128i(_b);
6842
6843 // signed shift right: faster than vclt
6844 // (b < 0) ? 0xFFFFFFFF : 0
6845 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6846
6847 // (b == 0) ? 0xFFFFFFFF : 0
6848 #if defined(__aarch64__)
6849 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6850 #else
6851 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6852 #endif
6853
6854 // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
6855 // 'a') based on ltMask
6856 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6857 // res = masked & (~zeroMask)
6858 int32x4_t res = vbicq_s32(masked, zeroMask);
6859 return vreinterpretq_m128i_s32(res);
6860 }
6861
6862 // Negate packed 8-bit integers in a when the corresponding signed
6863 // 8-bit integer in b is negative, and store the results in dst.
6864 // Elements in dst are zeroed out when the corresponding element
6865 // in b is zero.
6866 //
6867 // for i in 0..15
6868 // if b[i] < 0
6869 // r[i] := -a[i]
6870 // else if b[i] == 0
6871 // r[i] := 0
6872 // else
6873 // r[i] := a[i]
6874 // fi
6875 // done
6876 FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
6877 {
6878 int8x16_t a = vreinterpretq_s8_m128i(_a);
6879 int8x16_t b = vreinterpretq_s8_m128i(_b);
6880
6881 // signed shift right: faster than vclt
6882 // (b < 0) ? 0xFF : 0
6883 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6884
6885 // (b == 0) ? 0xFF : 0
6886 #if defined(__aarch64__)
6887 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6888 #else
6889 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6890 #endif
6891
6892 // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative 'a')
6893 // based on ltMask
6894 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6895 // res = masked & (~zeroMask)
6896 int8x16_t res = vbicq_s8(masked, zeroMask);
6897
6898 return vreinterpretq_m128i_s8(res);
6899 }
6900
6901 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
6902 // integer in b is negative, and store the results in dst. Elements in dst are
6903 // zeroed out when the corresponding element in b is zero.
6904 //
6905 // FOR j := 0 to 3
6906 // i := j*16
6907 // IF b[i+15:i] < 0
6908 // dst[i+15:i] := -(a[i+15:i])
6909 // ELSE IF b[i+15:i] == 0
6910 // dst[i+15:i] := 0
6911 // ELSE
6912 // dst[i+15:i] := a[i+15:i]
6913 // FI
6914 // ENDFOR
6915 //
6916 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
6917 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
6918 {
6919 int16x4_t a = vreinterpret_s16_m64(_a);
6920 int16x4_t b = vreinterpret_s16_m64(_b);
6921
6922 // signed shift right: faster than vclt
6923 // (b < 0) ? 0xFFFF : 0
6924 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6925
6926 // (b == 0) ? 0xFFFF : 0
6927 #if defined(__aarch64__)
6928 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6929 #else
6930 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6931 #endif
6932
6933     // bitwise select either a or negative 'a' (vneg_s16(a) returns negative 'a')
6934 // based on ltMask
6935 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6936 // res = masked & (~zeroMask)
6937 int16x4_t res = vbic_s16(masked, zeroMask);
6938
6939 return vreinterpret_m64_s16(res);
6940 }
6941
6942 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
6943 // integer in b is negative, and store the results in dst. Elements in dst are
6944 // zeroed out when the corresponding element in b is zero.
6945 //
6946 // FOR j := 0 to 1
6947 // i := j*32
6948 // IF b[i+31:i] < 0
6949 // dst[i+31:i] := -(a[i+31:i])
6950 // ELSE IF b[i+31:i] == 0
6951 // dst[i+31:i] := 0
6952 // ELSE
6953 // dst[i+31:i] := a[i+31:i]
6954 // FI
6955 // ENDFOR
6956 //
6957 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
6958 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
6959 {
6960 int32x2_t a = vreinterpret_s32_m64(_a);
6961 int32x2_t b = vreinterpret_s32_m64(_b);
6962
6963 // signed shift right: faster than vclt
6964 // (b < 0) ? 0xFFFFFFFF : 0
6965 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6966
6967 // (b == 0) ? 0xFFFFFFFF : 0
6968 #if defined(__aarch64__)
6969 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6970 #else
6971 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6972 #endif
6973
6974     // bitwise select either a or negative 'a' (vneg_s32(a) returns negative 'a')
6975 // based on ltMask
6976 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6977 // res = masked & (~zeroMask)
6978 int32x2_t res = vbic_s32(masked, zeroMask);
6979
6980 return vreinterpret_m64_s32(res);
6981 }
6982
6983 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
6984 // in b is negative, and store the results in dst. Elements in dst are zeroed out
6985 // when the corresponding element in b is zero.
6986 //
6987 // FOR j := 0 to 7
6988 // i := j*8
6989 // IF b[i+7:i] < 0
6990 // dst[i+7:i] := -(a[i+7:i])
6991 // ELSE IF b[i+7:i] == 0
6992 // dst[i+7:i] := 0
6993 // ELSE
6994 // dst[i+7:i] := a[i+7:i]
6995 // FI
6996 // ENDFOR
6997 //
6998 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
6999 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
7000 {
7001 int8x8_t a = vreinterpret_s8_m64(_a);
7002 int8x8_t b = vreinterpret_s8_m64(_b);
7003
7004 // signed shift right: faster than vclt
7005 // (b < 0) ? 0xFF : 0
7006 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
7007
7008 // (b == 0) ? 0xFF : 0
7009 #if defined(__aarch64__)
7010 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
7011 #else
7012 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
7013 #endif
7014
7015     // bitwise select either a or negative 'a' (vneg_s8(a) returns negative 'a')
7016 // based on ltMask
7017 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
7018 // res = masked & (~zeroMask)
7019 int8x8_t res = vbic_s8(masked, zeroMask);
7020
7021 return vreinterpret_m64_s8(res);
7022 }
7023
7024 /* SSE4.1 */
7025
7026 // Blend packed 16-bit integers from a and b using control mask imm8, and store
7027 // the results in dst.
7028 //
7029 // FOR j := 0 to 7
7030 // i := j*16
7031 // IF imm8[j]
7032 // dst[i+15:i] := b[i+15:i]
7033 // ELSE
7034 // dst[i+15:i] := a[i+15:i]
7035 // FI
7036 // ENDFOR
7037 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
7038 // __constrange(0,255) int imm)
7039 #define _mm_blend_epi16(a, b, imm) \
7040 __extension__({ \
7041 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
7042 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
7043 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
7044 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
7045 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
7046 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
7047 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
7048 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \
7049 uint16x8_t _mask_vec = vld1q_u16(_mask); \
7050 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
7051 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
7052 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
7053 })
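
// Illustrative usage sketch (not part of this header; variable names are
// placeholders): bit j of imm selects 16-bit lane j from b, so with imm = 0x0F
// the low four lanes come from b and the high four lanes come from a:
//
//   __m128i low_from_b = _mm_blend_epi16(a, b, 0x0F);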
7054
7055 // Blend packed double-precision (64-bit) floating-point elements from a and b
7056 // using control mask imm8, and store the results in dst.
7057 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd
7058 #define _mm_blend_pd(a, b, imm) \
7059 __extension__({ \
7060 const uint64_t _mask[2] = { \
7061 ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
7062 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)}; \
7063 uint64x2_t _mask_vec = vld1q_u64(_mask); \
7064 uint64x2_t _a = vreinterpretq_u64_m128d(a); \
7065 uint64x2_t _b = vreinterpretq_u64_m128d(b); \
7066 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
7067 })
7068
7069 // Blend packed single-precision (32-bit) floating-point elements from a and b
7070 // using mask, and store the results in dst.
7071 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps
7072 FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
7073 {
7074 const uint32_t ALIGN_STRUCT(16)
7075 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
7076 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
7077 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
7078 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
7079 uint32x4_t mask = vld1q_u32(data);
7080 float32x4_t a = vreinterpretq_f32_m128(_a);
7081 float32x4_t b = vreinterpretq_f32_m128(_b);
7082 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7083 }
7084
7085 // Blend packed 8-bit integers from a and b using mask, and store the results in
7086 // dst.
7087 //
7088 // FOR j := 0 to 15
7089 // i := j*8
7090 // IF mask[i+7]
7091 // dst[i+7:i] := b[i+7:i]
7092 // ELSE
7093 // dst[i+7:i] := a[i+7:i]
7094 // FI
7095 // ENDFOR
7096 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
7097 {
7098 // Use a signed shift right to create a mask with the sign bit
7099 uint8x16_t mask =
7100 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
7101 uint8x16_t a = vreinterpretq_u8_m128i(_a);
7102 uint8x16_t b = vreinterpretq_u8_m128i(_b);
7103 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
7104 }
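
// Illustrative usage sketch (not part of this header; variable names are
// placeholders): only the sign bit of each mask byte is examined, so a
// comparison result can drive the blend:
//
//   __m128i gt  = _mm_cmpgt_epi8(a, b);       // 0xFF where a > b, 0x00 otherwise
//   __m128i max = _mm_blendv_epi8(b, a, gt);  // per-byte signed maximum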
7105
7106 // Blend packed double-precision (64-bit) floating-point elements from a and b
7107 // using mask, and store the results in dst.
7108 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
7109 FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
7110 {
7111 uint64x2_t mask =
7112 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
7113 #if defined(__aarch64__)
7114 float64x2_t a = vreinterpretq_f64_m128d(_a);
7115 float64x2_t b = vreinterpretq_f64_m128d(_b);
7116 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
7117 #else
7118 uint64x2_t a = vreinterpretq_u64_m128d(_a);
7119 uint64x2_t b = vreinterpretq_u64_m128d(_b);
7120 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
7121 #endif
7122 }
7123
7124 // Blend packed single-precision (32-bit) floating-point elements from a and b
7125 // using mask, and store the results in dst.
7126 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
7127 FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
7128 {
7129 // Use a signed shift right to create a mask with the sign bit
7130 uint32x4_t mask =
7131 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
7132 float32x4_t a = vreinterpretq_f32_m128(_a);
7133 float32x4_t b = vreinterpretq_f32_m128(_b);
7134 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
7135 }
7136
7137 // Round the packed double-precision (64-bit) floating-point elements in a up
7138 // to an integer value, and store the results as packed double-precision
7139 // floating-point elements in dst.
7140 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd
7141 FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
7142 {
7143 #if defined(__aarch64__)
7144 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
7145 #else
7146 double *f = (double *) &a;
7147 return _mm_set_pd(ceil(f[1]), ceil(f[0]));
7148 #endif
7149 }
7150
7151 // Round the packed single-precision (32-bit) floating-point elements in a up to
7152 // an integer value, and store the results as packed single-precision
7153 // floating-point elements in dst.
7154 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
7155 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
7156 {
7157 #if defined(__aarch64__)
7158 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
7159 #else
7160 float *f = (float *) &a;
7161 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
7162 #endif
7163 }
7164
7165 // Round the lower double-precision (64-bit) floating-point element in b up to
7166 // an integer value, store the result as a double-precision floating-point
7167 // element in the lower element of dst, and copy the upper element from a to the
7168 // upper element of dst.
7169 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd
7170 FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
7171 {
7172 return _mm_move_sd(a, _mm_ceil_pd(b));
7173 }
7174
7175 // Round the lower single-precision (32-bit) floating-point element in b up to
7176 // an integer value, store the result as a single-precision floating-point
7177 // element in the lower element of dst, and copy the upper 3 packed elements
7178 // from a to the upper elements of dst.
7179 //
7180 // dst[31:0] := CEIL(b[31:0])
7181 // dst[127:32] := a[127:32]
7182 //
7183 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
7184 FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
7185 {
7186 return _mm_move_ss(a, _mm_ceil_ps(b));
7187 }
7188
7189 // Compare packed 64-bit integers in a and b for equality, and store the results
7190 // in dst
7191 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
7192 {
7193 #if defined(__aarch64__)
7194 return vreinterpretq_m128i_u64(
7195 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
7196 #else
7197 // ARMv7 lacks vceqq_u64
7198 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
7199 uint32x4_t cmp =
7200 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
7201 uint32x4_t swapped = vrev64q_u32(cmp);
7202 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
7203 #endif
7204 }
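
// Worked illustration of the ARMv7 fallback above (assumed example values):
// the 32-bit compare yields a per-half mask, and AND-ing it with the same
// mask with its halves swapped (vrev64q_u32) leaves all-ones only where both
// halves of a 64-bit lane matched.
//
//   a       = {0x1111111122222222, 0x3333333344444444}
//   b       = {0x1111111122222222, 0x3333333355555555}
//   cmp     = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF}
//   swapped = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000}
//   result  = {0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000}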
7205
7206 // Converts the four signed 16-bit integers in the lower 64 bits to four signed
7207 // 32-bit integers.
7208 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
7209 {
7210 return vreinterpretq_m128i_s32(
7211 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
7212 }
7213
7214 // Converts the two signed 16-bit integers in the lower 32 bits to two signed
7215 // 64-bit integers.
7216 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
7217 {
7218 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7219 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7220 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7221 return vreinterpretq_m128i_s64(s64x2);
7222 }
7223
7224 // Converts the two signed 32-bit integers in the lower 64 bits to two signed
7225 // 64-bit integers.
7226 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
7227 {
7228 return vreinterpretq_m128i_s64(
7229 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
7230 }
7231
7232 // Converts the eight signed 8-bit integers in the lower 64 bits to eight
7233 // signed 16-bit integers.
7234 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
7235 {
7236 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7237 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7238 return vreinterpretq_m128i_s16(s16x8);
7239 }
7240
7241 // Converts the four signed 8-bit integers in the lower 32 bits to four
7242 // signed 32-bit integers.
7243 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
7244 {
7245 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
7246 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7247 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
7248 return vreinterpretq_m128i_s32(s32x4);
7249 }
7250
7251 // Converts the two signed 8-bit integers in the lower 16 bits to two
7252 // signed 64-bit integers.
7253 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
7254 {
7255 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
7256 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7257 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
7258 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
7259 return vreinterpretq_m128i_s64(s64x2);
7260 }
7261
7262 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
7263 // unsigned 32-bit integers.
7264 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
7265 {
7266 return vreinterpretq_m128i_u32(
7267 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
7268 }
7269
7270 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
7271 // unsigned 64-bit integers.
7272 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
7273 {
7274 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
7275 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7276 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7277 return vreinterpretq_m128i_u64(u64x2);
7278 }
7279
7280 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
7281 // unsigned 64-bit integers.
7282 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
7283 {
7284 return vreinterpretq_m128i_u64(
7285 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
7286 }
7287
7288 // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
7289 // and store the results in dst.
7290 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16
7291 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
7292 {
7293 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
7294 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
7295 return vreinterpretq_m128i_u16(u16x8);
7296 }
7297
7298 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
7299 // unsigned 32-bit integers.
7300 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
7301 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
7302 {
7303 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
7304 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
7305 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
7306 return vreinterpretq_m128i_u32(u32x4);
7307 }
7308
7309 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
7310 // unsigned 64-bit integers.
7311 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
7312 {
7313 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
7314 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
7315 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
7316 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
7317 return vreinterpretq_m128i_u64(u64x2);
7318 }
7319
7320 // Conditionally multiply the packed single-precision (32-bit) floating-point
7321 // elements in a and b using the high 4 bits in imm8, sum the four products,
7322 // and conditionally store the sum in dst using the low 4 bits of imm.
7323 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
7324 FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7325 {
7326 #if defined(__aarch64__)
7327 /* shortcuts */
7328 if (imm == 0xFF) {
7329 return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
7330 }
7331 if (imm == 0x7F) {
7332 float32x4_t m = _mm_mul_ps(a, b);
7333 m[3] = 0;
7334 return _mm_set1_ps(vaddvq_f32(m));
7335 }
7336 #endif
7337
7338 float s = 0, c = 0;
7339 float32x4_t f32a = vreinterpretq_f32_m128(a);
7340 float32x4_t f32b = vreinterpretq_f32_m128(b);
7341
7342 /* To improve the accuracy of floating-point summation, Kahan algorithm
7343 * is used for each operation.
7344 */
7345 if (imm & (1 << 4))
7346 _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
7347 if (imm & (1 << 5))
7348 _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
7349 if (imm & (1 << 6))
7350 _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
7351 if (imm & (1 << 7))
7352 _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
7353 s += c;
7354
7355 float32x4_t res = {
7356 (imm & 0x1) ? s : 0,
7357 (imm & 0x2) ? s : 0,
7358 (imm & 0x4) ? s : 0,
7359 (imm & 0x8) ? s : 0,
7360 };
7361 return vreinterpretq_m128_f32(res);
7362 }
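
// Illustrative usage sketch (not part of this header; variable names are
// placeholders): the high four bits of imm choose which products enter the
// sum, the low four bits choose which output lanes receive it. A full
// four-element dot product written to lane 0 only would be:
//
//   __m128 dot = _mm_dp_ps(a, b, 0xF1);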
7363
7364 // Extracts the selected signed or unsigned 32-bit integer from a and zero
7365 // extends.
7366 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7367 #define _mm_extract_epi32(a, imm) \
7368 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7369
7370 // Extracts the selected signed or unsigned 64-bit integer from a and zero
7371 // extends.
7372 // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7373 #define _mm_extract_epi64(a, imm) \
7374 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7375
7376 // Extracts the selected signed or unsigned 8-bit integer from a and zero
7377 // extends.
7378 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
7379 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8
7380 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7381
7382 // Extracts the selected single-precision (32-bit) floating-point element from a.
7383 // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7384 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7385
7386 // Round the packed double-precision (64-bit) floating-point elements in a down
7387 // to an integer value, and store the results as packed double-precision
7388 // floating-point elements in dst.
7389 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd
7390 FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7391 {
7392 #if defined(__aarch64__)
7393 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7394 #else
7395 double *f = (double *) &a;
7396 return _mm_set_pd(floor(f[1]), floor(f[0]));
7397 #endif
7398 }
7399
7400 // Round the packed single-precision (32-bit) floating-point elements in a down
7401 // to an integer value, and store the results as packed single-precision
7402 // floating-point elements in dst.
7403 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
7404 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7405 {
7406 #if defined(__aarch64__)
7407 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7408 #else
7409 float *f = (float *) &a;
7410 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7411 #endif
7412 }
7413
7414 // Round the lower double-precision (64-bit) floating-point element in b down to
7415 // an integer value, store the result as a double-precision floating-point
7416 // element in the lower element of dst, and copy the upper element from a to the
7417 // upper element of dst.
7418 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd
7419 FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7420 {
7421 return _mm_move_sd(a, _mm_floor_pd(b));
7422 }
7423
7424 // Round the lower single-precision (32-bit) floating-point element in b down to
7425 // an integer value, store the result as a single-precision floating-point
7426 // element in the lower element of dst, and copy the upper 3 packed elements
7427 // from a to the upper elements of dst.
7428 //
7429 // dst[31:0] := FLOOR(b[31:0])
7430 // dst[127:32] := a[127:32]
7431 //
7432 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
7433 FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7434 {
7435 return _mm_move_ss(a, _mm_floor_ps(b));
7436 }
7437
7438 // Inserts the least significant 32 bits of b into the selected 32-bit integer
7439 // of a.
7440 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7441 // __constrange(0,4) int imm)
7442 #define _mm_insert_epi32(a, b, imm) \
7443 __extension__({ \
7444 vreinterpretq_m128i_s32( \
7445 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
7446 })
7447
7448 // Inserts the least significant 64 bits of b into the selected 64-bit integer
7449 // of a.
7450 // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7451 // __constrange(0,2) int imm)
7452 #define _mm_insert_epi64(a, b, imm) \
7453 __extension__({ \
7454 vreinterpretq_m128i_s64( \
7455 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
7456 })
7457
7458 // Inserts the least significant 8 bits of b into the selected 8-bit integer
7459 // of a.
7460 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7461 // __constrange(0,16) int imm)
7462 #define _mm_insert_epi8(a, b, imm) \
7463 __extension__({ \
7464 vreinterpretq_m128i_s8( \
7465 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
7466 })
7467
7468 // Copy a to tmp, then insert a single-precision (32-bit) floating-point
7469 // element from b into tmp using the control in imm8. Store tmp to dst using
7470 // the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7471 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=insert_ps
7472 #define _mm_insert_ps(a, b, imm8) \
7473 __extension__({ \
7474         float32x4_t tmp1 = vsetq_lane_f32(vgetq_lane_f32(b, ((imm8) >> 6) & 0x3), \
7475 vreinterpretq_f32_m128(a), 0); \
7476 float32x4_t tmp2 = \
7477 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), vreinterpretq_f32_m128(a), \
7478                            (((imm8) >> 4) & 0x3));                           \
7479 const uint32_t data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7480 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7481 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7482 ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; \
7483 uint32x4_t mask = vld1q_u32(data); \
7484 float32x4_t all_zeros = vdupq_n_f32(0); \
7485 \
7486 vreinterpretq_m128_f32( \
7487 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))); \
7488 })
7489
7490 // epi versions of min/max
7491 // Computes the pairwise maximums of the four signed 32-bit integer values of a
7492 // and b.
7493 //
7494 // A 128-bit parameter that can be defined with the following equations:
7495 // r0 := (a0 > b0) ? a0 : b0
7496 // r1 := (a1 > b1) ? a1 : b1
7497 // r2 := (a2 > b2) ? a2 : b2
7498 // r3 := (a3 > b3) ? a3 : b3
7499 //
7500 // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
7501 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7502 {
7503 return vreinterpretq_m128i_s32(
7504 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7505 }
7506
7507 // Compare packed signed 8-bit integers in a and b, and store packed maximum
7508 // values in dst.
7509 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
7510 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7511 {
7512 return vreinterpretq_m128i_s8(
7513 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7514 }
7515
7516 // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7517 // values in dst.
7518 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
7519 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7520 {
7521 return vreinterpretq_m128i_u16(
7522 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7523 }
7524
7525 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7526 // values in dst.
7527 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
7528 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7529 {
7530 return vreinterpretq_m128i_u32(
7531 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7532 }
7533
7534 // Computes the pairwise minima of the four signed 32-bit integer values of a
7535 // and b.
7536 //
7537 // A 128-bit parameter that can be defined with the following equations:
7538 // r0 := (a0 < b0) ? a0 : b0
7539 // r1 := (a1 < b1) ? a1 : b1
7540 // r2 := (a2 < b2) ? a2 : b2
7541 // r3 := (a3 < b3) ? a3 : b3
7542 //
7543 // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
7544 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7545 {
7546 return vreinterpretq_m128i_s32(
7547 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7548 }
7549
7550 // Compare packed signed 8-bit integers in a and b, and store packed minimum
7551 // values in dst.
7552 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
7553 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7554 {
7555 return vreinterpretq_m128i_s8(
7556 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7557 }
7558
7559 // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7560 // values in dst.
7561 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
7562 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7563 {
7564 return vreinterpretq_m128i_u16(
7565 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7566 }
7567
7568 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7569 // values in dst.
7570 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
7571 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7572 {
7573 return vreinterpretq_m128i_u32(
7574 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7575 }
7576
7577 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7578 // in a, store the minimum and index in dst, and zero the remaining bits in dst.
7579 //
7580 // index[2:0] := 0
7581 // min[15:0] := a[15:0]
7582 // FOR j := 0 to 7
7583 // i := j*16
7584 // IF a[i+15:i] < min[15:0]
7585 // index[2:0] := j
7586 // min[15:0] := a[i+15:i]
7587 // FI
7588 // ENDFOR
7589 // dst[15:0] := min[15:0]
7590 // dst[18:16] := index[2:0]
7591 // dst[127:19] := 0
7592 //
7593 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
7594 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7595 {
7596 __m128i dst;
7597 uint16_t min, idx = 0;
7598 // Find the minimum value
7599 #if defined(__aarch64__)
7600 min = vminvq_u16(vreinterpretq_u16_m128i(a));
7601 #else
7602 __m64 tmp;
7603 tmp = vreinterpret_m64_u16(
7604 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7605 vget_high_u16(vreinterpretq_u16_m128i(a))));
7606 tmp = vreinterpret_m64_u16(
7607 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7608 tmp = vreinterpret_m64_u16(
7609 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7610 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7611 #endif
7612 // Get the index of the minimum value
7613 int i;
7614 for (i = 0; i < 8; i++) {
7615 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7616 idx = (uint16_t) i;
7617 break;
7618 }
7619 a = _mm_srli_si128(a, 2);
7620 }
7621 // Generate result
7622 dst = _mm_setzero_si128();
7623 dst = vreinterpretq_m128i_u16(
7624 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7625 dst = vreinterpretq_m128i_u16(
7626 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
7627 return dst;
7628 }
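
// Illustrative usage sketch (not part of this header; variable names are
// placeholders): the minimum sits in the low 16 bits of the result and its
// index in the next three bits, so both can be read back with
// _mm_extract_epi16:
//
//   __m128i r    = _mm_minpos_epu16(v);
//   uint16_t min = (uint16_t) _mm_extract_epi16(r, 0);
//   uint16_t idx = (uint16_t) _mm_extract_epi16(r, 1);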
7629
7630 // Multiply the low signed 32-bit integers from each packed 64-bit element in
7631 // a and b, and store the signed 64-bit results in dst.
7632 //
7633 // r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
7634 // r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
7635 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
7636 {
7637 // vmull_s32 upcasts instead of masking, so we downcast.
7638 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
7639 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
7640 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
7641 }
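
// Illustrative usage sketch (not part of this header): only the low (even)
// 32-bit element of each 64-bit lane participates in the multiply, e.g.
//
//   __m128i a = _mm_set_epi32(9, -3, 7, 2);
//   __m128i b = _mm_set_epi32(9,  4, 7, 5);
//   __m128i p = _mm_mul_epi32(a, b);  // 64-bit lanes {2*5, -3*4} = {10, -12}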
7642
7643 // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
7644 // unsigned 32-bit integers from b.
7645 // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
7646 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
7647 {
7648 return vreinterpretq_m128i_s32(
7649 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7650 }
7651
7652 // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
7653 // integers and saturates.
7654 //
7655 // r0 := UnsignedSaturate(a0)
7656 // r1 := UnsignedSaturate(a1)
7657 // r2 := UnsignedSaturate(a2)
7658 // r3 := UnsignedSaturate(a3)
7659 // r4 := UnsignedSaturate(b0)
7660 // r5 := UnsignedSaturate(b1)
7661 // r6 := UnsignedSaturate(b2)
7662 // r7 := UnsignedSaturate(b3)
7663 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
7664 {
7665 return vreinterpretq_m128i_u16(
7666 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
7667 vqmovun_s32(vreinterpretq_s32_m128i(b))));
7668 }
7669
7670 // Round the packed double-precision (64-bit) floating-point elements in a using
7671 // the rounding parameter, and store the results as packed double-precision
7672 // floating-point elements in dst.
7673 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd
7674 FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
7675 {
7676 #if defined(__aarch64__)
7677 switch (rounding) {
7678 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7679 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7680 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7681 return _mm_floor_pd(a);
7682 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7683 return _mm_ceil_pd(a);
7684 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7685 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7686 default: //_MM_FROUND_CUR_DIRECTION
7687 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7688 }
7689 #else
7690 double *v_double = (double *) &a;
7691
7692 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7693 (rounding == _MM_FROUND_CUR_DIRECTION &&
7694 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7695 double res[2], tmp;
7696 for (int i = 0; i < 2; i++) {
7697 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7698 double roundDown = floor(tmp); // Round down value
7699 double roundUp = ceil(tmp); // Round up value
7700 double diffDown = tmp - roundDown;
7701 double diffUp = roundUp - tmp;
7702 if (diffDown < diffUp) {
7703 /* If it's closer to the round down value, then use it */
7704 res[i] = roundDown;
7705 } else if (diffDown > diffUp) {
7706 /* If it's closer to the round up value, then use it */
7707 res[i] = roundUp;
7708 } else {
7709 /* If it's equidistant between round up and round down value,
7710 * pick the one which is an even number */
7711 double half = roundDown / 2;
7712 if (half != floor(half)) {
7713 /* If the round down value is odd, return the round up value
7714 */
7715 res[i] = roundUp;
7716 } else {
7717 /* If the round up value is odd, return the round down value
7718 */
7719 res[i] = roundDown;
7720 }
7721 }
7722 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7723 }
7724 return _mm_set_pd(res[1], res[0]);
7725 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7726 (rounding == _MM_FROUND_CUR_DIRECTION &&
7727 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7728 return _mm_floor_pd(a);
7729 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7730 (rounding == _MM_FROUND_CUR_DIRECTION &&
7731 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7732 return _mm_ceil_pd(a);
7733 }
7734 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7735 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7736 #endif
7737 }
7738
7739 // Round the packed single-precision (32-bit) floating-point elements in a using
7740 // the rounding parameter, and store the results as packed single-precision
7741 // floating-point elements in dst.
7742 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
7743 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
7744 {
7745 #if defined(__aarch64__)
7746 switch (rounding) {
7747 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7748 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
7749 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7750 return _mm_floor_ps(a);
7751 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7752 return _mm_ceil_ps(a);
7753 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7754 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
7755 default: //_MM_FROUND_CUR_DIRECTION
7756 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
7757 }
7758 #else
7759 float *v_float = (float *) &a;
7760
7761 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7762 (rounding == _MM_FROUND_CUR_DIRECTION &&
7763 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7764 uint32x4_t signmask = vdupq_n_u32(0x80000000);
7765 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
7766 vdupq_n_f32(0.5f)); /* +/- 0.5 */
7767 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7768 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
7769 int32x4_t r_trunc = vcvtq_s32_f32(
7770 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
7771 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7772 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
7773 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7774 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
7775 float32x4_t delta = vsubq_f32(
7776 vreinterpretq_f32_m128(a),
7777 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
7778 uint32x4_t is_delta_half =
7779 vceqq_f32(delta, half); /* delta == +/- 0.5 */
7780 return vreinterpretq_m128_f32(
7781 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7782 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7783 (rounding == _MM_FROUND_CUR_DIRECTION &&
7784 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7785 return _mm_floor_ps(a);
7786 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7787 (rounding == _MM_FROUND_CUR_DIRECTION &&
7788 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7789 return _mm_ceil_ps(a);
7790 }
7791 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7792 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7793 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7794 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7795 #endif
7796 }
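
// Worked illustration of the round-to-nearest-even fallback above (assumed
// input): for a = 2.5, r_trunc = 2 and the biased conversion gives r_normal =
// 3; because the remainder is exactly 0.5, the even candidate r_even =
// (2 + 1) & ~1 = 2 is selected instead, matching SSE's default rounding mode.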
7797
7798 // Round the lower double-precision (64-bit) floating-point element in b using
7799 // the rounding parameter, store the result as a double-precision floating-point
7800 // element in the lower element of dst, and copy the upper element from a to the
7801 // upper element of dst.
7802 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd
7803 FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
7804 {
7805 return _mm_move_sd(a, _mm_round_pd(b, rounding));
7806 }
7807
7808 // Round the lower single-precision (32-bit) floating-point element in b using
7809 // the rounding parameter, store the result as a single-precision floating-point
7810 // element in the lower element of dst, and copy the upper 3 packed elements
7811 // from a to the upper elements of dst. Rounding is done according to the
7812 // rounding[3:0] parameter, which can be one of:
7813 //   (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)  // round to nearest, suppress exceptions
7814 //   (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)      // round down, suppress exceptions
7815 //   (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)      // round up, suppress exceptions
7816 //   (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)         // truncate, suppress exceptions
7817 //   _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7822 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss
7823 FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
7824 {
7825 return _mm_move_ss(a, _mm_round_ps(b, rounding));
7826 }
7827
7828 // Load 128-bits of integer data from memory into dst using a non-temporal
7829 // memory hint. mem_addr must be aligned on a 16-byte boundary or a
7830 // general-protection exception may be generated.
7831 //
7832 // dst[127:0] := MEM[mem_addr+127:mem_addr]
7833 //
7834 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
7835 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
7836 {
7837 #if __has_builtin(__builtin_nontemporal_store)
7838 return __builtin_nontemporal_load(p);
7839 #else
7840 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
7841 #endif
7842 }
7843
7844 // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
7845 // all 1's, and return 1 if the result is zero, otherwise return 0.
7846 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
7847 FORCE_INLINE int _mm_test_all_ones(__m128i a)
7848 {
7849 return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7850 ~(uint64_t) 0;
7851 }
7852
7853 // Compute the bitwise AND of 128 bits (representing integer data) in a and
7854 // mask, and return 1 if the result is zero, otherwise return 0.
7855 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
7856 FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
7857 {
7858 int64x2_t a_and_mask =
7859 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
7860 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7861 }
7862
7863 // Compute the bitwise AND of 128 bits (representing integer data) in a and
7864 // mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
7865 // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
7866 // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7867 // otherwise return 0.
7868 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_test_mix_ones_zero
7869 FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
7870 {
7871 uint64x2_t zf =
7872 vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7873 uint64x2_t cf =
7874 vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7875 uint64x2_t result = vandq_u64(zf, cf);
7876 return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
7877 }
7878
7879 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7880 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7881 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7882 // otherwise set CF to 0. Return the CF value.
7883 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
7884 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
7885 {
7886 int64x2_t s64 =
7887 vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
7888 vreinterpretq_s64_m128i(b));
7889 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7890 }
7891
7892 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7893 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7894 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7895 // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7896 // otherwise return 0.
7897 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128
7898 #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7899
7900 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7901 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7902 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7903 // otherwise set CF to 0. Return the ZF value.
7904 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
7905 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
7906 {
7907 int64x2_t s64 =
7908 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
7909 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7910 }
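
// Illustrative usage sketch (not part of this header; variable names are
// placeholders): a common pattern is testing whether a comparison produced
// any hit at all:
//
//   __m128i eq = _mm_cmpeq_epi8(haystack, needle);
//   if (!_mm_testz_si128(eq, eq)) {
//       /* at least one byte matched */
//   }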
7911
7912 /* SSE4.2 */
7913
7914 // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
7915 // in b for greater than.
7916 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
7917 {
7918 #if defined(__aarch64__)
7919 return vreinterpretq_m128i_u64(
7920 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
7921 #else
7922 return vreinterpretq_m128i_s64(vshrq_n_s64(
7923 vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
7924 63));
7925 #endif
7926 }
7927
7928 // Starting with the initial value in crc, accumulates a CRC32 value for
7929 // unsigned 16-bit integer v.
7930 // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
7931 FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
7932 {
7933 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
7934 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
7935 : [c] "+r"(crc)
7936 : [v] "r"(v));
7937 #else
7938 crc = _mm_crc32_u8(crc, v & 0xff);
7939 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
7940 #endif
7941 return crc;
7942 }
7943
7944 // Starting with the initial value in crc, accumulates a CRC32 value for
7945 // unsigned 32-bit integer v.
7946 // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
7947 FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
7948 {
7949 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
7950 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
7951 : [c] "+r"(crc)
7952 : [v] "r"(v));
7953 #else
7954 crc = _mm_crc32_u16(crc, v & 0xffff);
7955 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
7956 #endif
7957 return crc;
7958 }
7959
7960 // Starting with the initial value in crc, accumulates a CRC32 value for
7961 // unsigned 64-bit integer v.
7962 // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
7963 FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
7964 {
7965 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
7966 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
7967 : [c] "+r"(crc)
7968 : [v] "r"(v));
7969 #else
7970 crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
7971 crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
7972 #endif
7973 return crc;
7974 }
7975
7976 // Starting with the initial value in crc, accumulates a CRC32 value for
7977 // unsigned 8-bit integer v.
7978 // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
7979 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
7980 {
7981 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
7982 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
7983 : [c] "+r"(crc)
7984 : [v] "r"(v));
7985 #else
7986 crc ^= v;
7987 for (int bit = 0; bit < 8; bit++) {
7988 if (crc & 1)
7989 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
7990 else
7991 crc = (crc >> 1);
7992 }
7993 #endif
7994 return crc;
7995 }
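
// Illustrative usage sketch (not part of this header): accumulating CRC32-C
// over a byte buffer with the customary initial value and final inversion
// (the helper name below is hypothetical):
//
//   static uint32_t crc32c_bytes(const uint8_t *p, size_t n)
//   {
//       uint32_t crc = ~UINT32_C(0);
//       for (size_t i = 0; i < n; i++)
//           crc = _mm_crc32_u8(crc, p[i]);
//       return ~crc;
//   }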
7996
7997 /* AES */
7998
7999 #if !defined(__ARM_FEATURE_CRYPTO)
8000 /* clang-format off */
8001 #define SSE2NEON_AES_DATA(w) \
8002 { \
8003 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8004 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8005 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8006 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8007 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8008 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8009 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8010 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8011 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8012 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8013 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8014 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8015 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8016 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8017 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8018 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8019 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8020 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8021 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8022 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8023 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8024 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8025 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8026 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8027 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8028 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8029 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8030 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8031 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8032 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8033 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8034 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8035 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8036 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8037 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8038 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8039 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8040 }
8041 /* clang-format on */
8042
8043 /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
8044 #define SSE2NEON_AES_H0(x) (x)
8045 static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
8046 #undef SSE2NEON_AES_H0
8047
8048 // In the absence of crypto extensions, implement aesenc using regular neon
8049 // intrinsics instead. See:
8050 // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
8051 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
8052 // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
8053 // for more information. Reproduced with permission of the author.
8054 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
8055 {
8056 #if defined(__aarch64__)
8057 static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
8058 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
8059 0xc, 0x1, 0x6, 0xb};
8060 static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8061 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
8062
8063 uint8x16_t v;
8064 uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
8065
8066 // shift rows
8067 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8068
8069 // sub bytes
8070 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w);
8071 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
8072 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
8073 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
8074
8075 // mix columns
8076 w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
8077 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8078 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8079
8080 // add round key
8081 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8082
8083 #else /* ARMv7-A NEON implementation */
8084 #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8085 (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
8086 (b0))
8087 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
8088 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8089 #define SSE2NEON_AES_U0(p) \
8090 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8091 #define SSE2NEON_AES_U1(p) \
8092 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8093 #define SSE2NEON_AES_U2(p) \
8094 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8095 #define SSE2NEON_AES_U3(p) \
8096 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8097 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8098 SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
8099 SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
8100 SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
8101 SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
8102 };
8103 #undef SSE2NEON_AES_B2W
8104 #undef SSE2NEON_AES_F2
8105 #undef SSE2NEON_AES_F3
8106 #undef SSE2NEON_AES_U0
8107 #undef SSE2NEON_AES_U1
8108 #undef SSE2NEON_AES_U2
8109 #undef SSE2NEON_AES_U3
8110
8111 uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
8112 uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
8113 uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
8114 uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
8115
8116 __m128i out = _mm_set_epi32(
8117 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8118 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8119 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8120 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8121 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8122 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8123 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8124 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8125
8126 return _mm_xor_si128(out, RoundKey);
8127 #endif
8128 }
8129
8130 // Perform the last round of an AES encryption flow on data (state) in a using
8131 // the round key in RoundKey, and store the result in dst.
8132 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8133 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8134 {
8135 /* FIXME: optimized for NEON */
8136 uint8_t v[4][4] = {
8137 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
8138 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
8139 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
8140 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
8141 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
8142 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
8143 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
8144 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
8145 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
8146 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
8147 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
8148 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
8149 {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
8150 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
8151 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
8152 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
8153 };
8154 for (int i = 0; i < 16; i++)
8155 vreinterpretq_nth_u8_m128i(a, i) =
8156 v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
8157 return a;
8158 }
8159
8160 // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
8161 // This instruction generates a round key for AES encryption. See
8162 // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
8163 // for details.
8164 //
8165 // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
8166 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
8167 {
8168 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
8169 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
8170 for (int i = 0; i < 4; ++i) {
8171 ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
8172 ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
8173 }
8174 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
8175 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8176 }
8177 #undef SSE2NEON_AES_DATA
8178
8179 #else /* __ARM_FEATURE_CRYPTO */
8180 // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
8181 // AESMC and then manually applying the real key as an xor operation. This
8182 // unfortunately means an additional xor op; the compiler should be able to
8183 // optimize this away for repeated calls however. See
8184 // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
8185 // for more details.
8186 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
8187 {
8188 return vreinterpretq_m128i_u8(
8189 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
8190 vreinterpretq_u8_m128i(b));
8191 }
8192
8193 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
8194 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8195 {
8196 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
8197 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8198 RoundKey);
8199 }
8200
8201 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
8202 {
8203 // AESE does ShiftRows and SubBytes on A
8204 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
8205
8206 uint8x16_t dest = {
8207 // Undo ShiftRows step from AESE and extract X1 and X3
8208 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
8209 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
8210 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
8211 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
8212 };
8213 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
8214 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
8215 }
8216 #endif
8217
8218 /* Others */
8219
8220 // Perform a carry-less multiplication of two 64-bit integers, selected from a
8221 // and b according to imm8, and store the results in dst.
8222 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128
_mm_clmulepi64_si128(__m128i _a,__m128i _b,const int imm)8223 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
8224 {
8225 uint64x2_t a = vreinterpretq_u64_m128i(_a);
8226 uint64x2_t b = vreinterpretq_u64_m128i(_b);
8227 switch (imm & 0x11) {
8228 case 0x00:
8229 return vreinterpretq_m128i_u64(
8230 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
8231 case 0x01:
8232 return vreinterpretq_m128i_u64(
8233 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
8234 case 0x10:
8235 return vreinterpretq_m128i_u64(
8236 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
8237 case 0x11:
8238 return vreinterpretq_m128i_u64(
8239 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
8240 default:
8241 abort();
8242 }
8243 }
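
// Example (illustrative sketch only): computing all four 64x64 partial
// products of a full 128x128-bit carry-less multiplication, e.g. as the first
// step of a GHASH- or CRC-style reduction. Variable names are illustrative.
//
//   __m128i lo  = _mm_clmulepi64_si128(a, b, 0x00); // a[63:0]   * b[63:0]
//   __m128i m0  = _mm_clmulepi64_si128(a, b, 0x01); // a[127:64] * b[63:0]
//   __m128i m1  = _mm_clmulepi64_si128(a, b, 0x10); // a[63:0]   * b[127:64]
//   __m128i hi  = _mm_clmulepi64_si128(a, b, 0x11); // a[127:64] * b[127:64]
//   __m128i mid = _mm_xor_si128(m0, m1);            // combined middle term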

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    /* Widen a to 64 bits via vcreate_u8 instead of loading 8 bytes from the
     * 4-byte argument, which would read out of bounds. */
    input_val = vcreate_u8((uint64_t) a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    /* Extract lane 0 only; storing the whole 64-bit vector to &count would
     * write 8 bytes into a 4-byte object. */
    count = vget_lane_u32(count32x2_val, 0);
    return count;
#endif
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}
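
// Example (illustrative sketch only): summing set bits over a buffer with the
// popcount helpers above. The function name and buffer layout are assumptions
// for illustration.
//
//   static inline uint64_t bitcount64(const uint64_t *words, size_t n)
//   {
//       uint64_t total = 0;
//       for (size_t i = 0; i < n; i++)
//           total += (uint64_t) _mm_popcnt_u64(words[i]);
//       return total;
//   }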

#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

#endif