1 #ifndef SSE2NEON_H
2 #define SSE2NEON_H
3
// This header file provides a simple API translation layer
// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions
6 //
7 // This header file does not yet translate all of the SSE intrinsics.
8 //
9 // Contributors to this work are:
10 // John W. Ratcliff <jratcliffscarab@gmail.com>
11 // Brandon Rowlett <browlett@nvidia.com>
12 // Ken Fast <kfast@gdeb.com>
13 // Eric van Beurden <evanbeurden@nvidia.com>
14 // Alexander Potylitsin <apotylitsin@nvidia.com>
15 // Hasindu Gamaarachchi <hasindu2008@gmail.com>
16 // Jim Huang <jserv@biilabs.io>
17 // Mark Cheng <marktwtn@biilabs.io>
18 // Malcolm James MacLeod <malcolm@gulden.com>
19 // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
20 // Sebastian Pop <spop@amazon.com>
21 // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
22 // Danila Kutenin <danilak@google.com>
23 // François Turban (JishinMaster) <francois.turban@gmail.com>
24 // Pei-Hsuan Hung <afcidk@gmail.com>
25 // Yang-Hao Yuan <yanghau@biilabs.io>
26
27 /*
28 * sse2neon is freely redistributable under the MIT License.
29 *
30 * Permission is hereby granted, free of charge, to any person obtaining a copy
31 * of this software and associated documentation files (the "Software"), to deal
32 * in the Software without restriction, including without limitation the rights
33 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34 * copies of the Software, and to permit persons to whom the Software is
35 * furnished to do so, subject to the following conditions:
36 *
37 * The above copyright notice and this permission notice shall be included in
38 * all copies or substantial portions of the Software.
39 *
40 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
46 * SOFTWARE.
47 */
48
49 /* Tunable configurations */
50
/* Enable precise implementation of _mm_min_ps and _mm_max_ps.
* This slows the computation down a bit, but gives results consistent with
* x86 SSE2 (e.g. it avoids a hole or NaN pixel in a rendering result).
54 */
55 #ifndef SSE2NEON_PRECISE_MINMAX
56 #define SSE2NEON_PRECISE_MINMAX (0)
57 #endif
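// Usage sketch (illustrative, not part of the header): a translation unit
// that needs bit-exact SSE min/max behaviour can opt in before including
// this header:
//
//   #define SSE2NEON_PRECISE_MINMAX 1
//   #include "sse2neon.h"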
58
59 #if defined(__GNUC__) || defined(__clang__)
60 #pragma push_macro("FORCE_INLINE")
61 #pragma push_macro("ALIGN_STRUCT")
62 #define FORCE_INLINE static inline __attribute__((always_inline))
63 #define ALIGN_STRUCT(x) __attribute__((aligned(x)))
64 #else
65 #error "Macro name collisions may happen with unsupported compiler."
66 #ifdef FORCE_INLINE
67 #undef FORCE_INLINE
68 #endif
69 #define FORCE_INLINE static inline
70 #ifndef ALIGN_STRUCT
71 #define ALIGN_STRUCT(x) __declspec(align(x))
72 #endif
73 #endif
74
75 #include <stdint.h>
76 #include <stdlib.h>
77
// These cause the build to fail on Raspberry Pi with 'unsupported target'
// and don't seem to do anything particularly useful.
80 ///* Architecture-specific build options */
81 ///* FIXME: #pragma GCC push_options is only available on GCC */
82 //#if defined(__GNUC__)
83 //#if defined(__arm__) && __ARM_ARCH == 7
84 ///* According to ARM C Language Extensions Architecture specification,
85 // * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
86 // * architecture supported.
87 // */
88 //#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
89 //#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
90 //#endif
91 //#pragma GCC push_options
92 //#pragma GCC target("fpu=neon")
93 //#elif defined(__aarch64__)
94 //#pragma GCC push_options
95 //#pragma GCC target("+simd")
96 //#else
97 //#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
98 //#endif
99 //#endif
100
101 #include <arm_neon.h>
102
/* Rounding functions require either AArch64 instructions or a libm fallback */
104 #if !defined(__aarch64__)
105 #include <math.h>
106 #endif
107
108 /* "__has_builtin" can be used to query support for built-in functions
109 * provided by gcc/clang and other compilers that support it.
110 */
111 #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
112 /* Compatibility with gcc <= 9 */
113 #if __GNUC__ <= 9
114 #define __has_builtin(x) HAS##x
115 #define HAS__builtin_popcount 1
116 #define HAS__builtin_popcountll 1
117 #else
118 #define __has_builtin(x) 0
119 #endif
120 #endif
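// With the fallback above, a query such as __has_builtin(__builtin_popcount)
// token-pastes to HAS__builtin_popcount, which is defined to 1 for gcc <= 9;
// any other argument pastes to an undefined macro and therefore evaluates to 0
// in preprocessor conditionals. (Explanatory note.)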
121
122 /**
123 * MACRO for shuffle parameter for _mm_shuffle_ps().
* Argument fp3 is a digit[0123] that represents the fp from argument "b"
* of _mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
* for fp2 in the result. fp1 is a digit[0123] that represents the fp from
* argument "a" of _mm_shuffle_ps that will be placed in fp1 of the result.
* fp0 is the same for fp0 of the result.
129 */
130 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
131 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
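// For example, _MM_SHUFFLE(0, 1, 2, 3) evaluates to
// (0 << 6) | (1 << 4) | (2 << 2) | 3 == 0x1B, the immediate that reverses the
// four 32-bit lanes when used as _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)).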
132
133 /* Rounding mode macros. */
134 #define _MM_FROUND_TO_NEAREST_INT 0x00
135 #define _MM_FROUND_TO_NEG_INF 0x01
136 #define _MM_FROUND_TO_POS_INF 0x02
137 #define _MM_FROUND_TO_ZERO 0x03
138 #define _MM_FROUND_CUR_DIRECTION 0x04
139 #define _MM_FROUND_NO_EXC 0x08
140
141 /* indicate immediate constant argument in a given range */
142 #define __constrange(a, b) const
143
144 /* A few intrinsics accept traditional data types like ints or floats, but
145 * most operate on data types that are specific to SSE.
146 * If a vector type ends in d, it contains doubles, and if it does not have
147 * a suffix, it contains floats. An integer vector type can contain any type
148 * of integer, from chars to shorts to unsigned long longs.
149 */
150 typedef int64x1_t __m64;
151 typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On 32-bit ARM, the float64x2_t type is not supported, so __m128d has to be
// represented differently for the related intrinsic conversions.
155 #if defined(__aarch64__)
156 typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
157 #else
158 typedef float32x4_t __m128d;
159 #endif
160 typedef int64x2_t __m128i; /* 128-bit vector containing integers */
161
162 /* type-safe casting between types */
163
164 #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
165 #define vreinterpretq_m128_f32(x) (x)
166 #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
167
168 #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
169 #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
170 #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
171 #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
172
173 #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
174 #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
175 #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
176 #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
177
178 #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
179 #define vreinterpretq_f32_m128(x) (x)
180 #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
181
182 #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
183 #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
184 #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
185 #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
186
187 #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
188 #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
189 #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
190 #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
191
192 #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
193 #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
194 #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
195 #define vreinterpretq_m128i_s64(x) (x)
196
197 #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
198 #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
199 #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
200 #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
201
202 #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
203 #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
204 #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
205 #define vreinterpretq_s64_m128i(x) (x)
206
207 #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
208 #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
209 #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
210 #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
211
212 #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
213 #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
214 #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
215 #define vreinterpret_m64_s64(x) (x)
216
217 #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
218 #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
219 #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
220 #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
221
222 #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
223 #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
224 #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
225
226 #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
227 #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
228 #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
229 #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
230
231 #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
232 #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
233 #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
234 #define vreinterpret_s64_m64(x) (x)
235
236 #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
237
238 #if defined(__aarch64__)
239 #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
240 #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
241
242 #define vreinterpretq_m128d_f64(x) (x)
243
244 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
245
246 #define vreinterpretq_f64_m128d(x) (x)
247 #else
248 #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
249 #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
250
251 #define vreinterpretq_m128d_f32(x) (x)
252
253 #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
254
255 #define vreinterpretq_f32_m128d(x) (x)
256 #endif
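// These wrappers are used throughout the header in a common pattern
// (illustrative sketch): view the SSE vector as the NEON element type an
// operation needs, apply the NEON intrinsic, and convert the result back.
// For example, a bitwise AND of two __m128 values can be written as:
//
//   vreinterpretq_m128_s32(vandq_s32(vreinterpretq_s32_m128(a),
//                                    vreinterpretq_s32_m128(b)));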
257
258 // A struct is defined in this header file called 'SIMDVec' which can be used
// by applications which attempt to access the contents of an __m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is considered bad coding practice by Microsoft; see:
262 // https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
263 //
264 // However, some legacy source code may try to access the contents of an __m128
265 // struct directly so the developer can use the SIMDVec as an alias for it. Any
266 // casting must be done manually by the developer, as you cannot cast or
267 // otherwise alias the base NEON data type for intrinsic operations.
268 //
// This union is intended to allow direct access to an __m128 variable using
// the names that the MSVC compiler provides. It should really only be used
// when trying to access the members of the vector as integer values; GCC and
// clang allow native access to the float members through a simple array
// access operator (in C since 4.6, in C++ since 4.8).
274 //
// Ideally, direct accesses to SIMD vectors should be avoided since they can
// cause a performance hit. If such access really is needed, the original __m128
277 // variable can be aliased with a pointer to this union and used to access
278 // individual components. The use of this union should be hidden behind a macro
279 // that is used throughout the codebase to access the members instead of always
280 // declaring this type of variable.
281 typedef union ALIGN_STRUCT(16) SIMDVec {
282 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
283 int8_t m128_i8[16]; // as signed 8-bit integers.
284 int16_t m128_i16[8]; // as signed 16-bit integers.
285 int32_t m128_i32[4]; // as signed 32-bit integers.
286 int64_t m128_i64[2]; // as signed 64-bit integers.
287 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
288 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
289 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
290 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
291 } SIMDVec;
292
293 // casting using SIMDVec
294 #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
295 #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
296 #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
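// Usage sketch (illustrative): reading the n-th 32-bit lane of an __m128i
// through the SIMDVec alias instead of a NEON lane intrinsic:
//
//   __m128i v = _mm_set_epi32(3, 2, 1, 0);
//   uint32_t lane2 = vreinterpretq_nth_u32_m128i(v, 2); // lane2 == 2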
297
/* Backwards compatibility for compilers lacking specific type support */
299
// Older gcc versions do not provide the vld1q_u8_x4 intrinsic
301 #if defined(__GNUC__) && !defined(__clang__)
302 #if __GNUC__ <= 9
FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
304 {
305 uint8x16x4_t ret;
306 ret.val[0] = vld1q_u8(p + 0);
307 ret.val[1] = vld1q_u8(p + 16);
308 ret.val[2] = vld1q_u8(p + 32);
309 ret.val[3] = vld1q_u8(p + 48);
310 return ret;
311 }
312 #endif
313 #endif
314
315 /* Function Naming Conventions
316 * The naming convention of SSE intrinsics is straightforward. A generic SSE
317 * intrinsic function is given as follows:
318 * _mm_<name>_<data_type>
319 *
320 * The parts of this format are given as follows:
321 * 1. <name> describes the operation performed by the intrinsic
322 * 2. <data_type> identifies the data type of the function's primary arguments
323 *
324 * This last part, <data_type>, is a little complicated. It identifies the
325 * content of the input values, and can be set to any of the following values:
326 * + ps - vectors contain floats (ps stands for packed single-precision)
* + pd - vectors contain doubles (pd stands for packed double-precision)
328 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
329 * signed integers
330 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
331 * unsigned integers
332 * + si128 - unspecified 128-bit vector or 256-bit vector
333 * + m128/m128i/m128d - identifies input vector types when they are different
334 * than the type of the returned vector
335 *
336 * For example, _mm_setzero_ps. The _mm implies that the function returns
337 * a 128-bit vector. The _ps at the end implies that the argument vectors
338 * contain floats.
339 *
340 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
* // Set packed 16-bit integers: 128 bits, 8 shorts, 16 bits each
342 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
343 * // Set packed 8-bit integers
* // 128 bits, 16 chars, 8 bits each
345 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
346 * 4, 5, 12, 13, 6, 7, 14, 15);
347 * // Shuffle packed 8-bit integers
348 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
349 *
350 * Data (Number, Binary, Byte Index):
+------+------+------+------+------+------+------+------+
|      1      |      2      |      3      |      4      | Number
+------+------+------+------+------+------+------+------+
| 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
+------+------+------+------+------+------+------+------+
|    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
+------+------+------+------+------+------+------+------+

+------+------+------+------+------+------+------+------+
|      5      |      6      |      7      |      8      | Number
+------+------+------+------+------+------+------+------+
| 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
+------+------+------+------+------+------+------+------+
|    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
+------+------+------+------+------+------+------+------+
* Index (Byte Index):
+------+------+------+------+------+------+------+------+
|    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
+------+------+------+------+------+------+------+------+

+------+------+------+------+------+------+------+------+
|    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
+------+------+------+------+------+------+------+------+
* Result:
+------+------+------+------+------+------+------+------+
|    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
+------+------+------+------+------+------+------+------+
| 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
+------+------+------+------+------+------+------+------+
|     256     |      2      |      5      |      6      | Number
+------+------+------+------+------+------+------+------+

+------+------+------+------+------+------+------+------+
|    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
+------+------+------+------+------+------+------+------+
| 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
+------+------+------+------+------+------+------+------+
|      3      |      7      |      4      |      8      | Number
+------+------+------+------+------+------+------+------+
390 */
391
392 /* Set/get methods */
393
394 /* Constants for use with _mm_prefetch. */
395 enum _mm_hint {
396 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
397 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
398 _MM_HINT_T1 = 2, /* load data to L2 cache only */
399 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
400 _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
401 _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
402 _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
403 _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
404 };
405
406 // Loads one cache line of data from address p to a location closer to the
407 // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
FORCE_INLINE void _mm_prefetch(const void *p, int i)
409 {
410 (void) i;
411 __builtin_prefetch(p);
412 }
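// Usage sketch (illustrative; ptr stands for any valid address): the hint
// argument is accepted for source compatibility but ignored here, so the
// following calls are equivalent and both lower to a plain __builtin_prefetch:
//
//   _mm_prefetch(ptr, _MM_HINT_T0);
//   _mm_prefetch(ptr, _MM_HINT_NTA);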
413
414 // Copy the lower single-precision (32-bit) floating-point element of a to dst.
415 //
416 // dst[31:0] := a[31:0]
417 //
418 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
420 {
421 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
422 }
423
424 // Convert the lower single-precision (32-bit) floating-point element in a to a
425 // 32-bit integer, and store the result in dst.
426 //
427 // dst[31:0] := Convert_FP32_To_Int32(a[31:0])
428 //
429 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
430 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
431
432 // Convert the lower single-precision (32-bit) floating-point element in a to a
433 // 64-bit integer, and store the result in dst.
434 //
435 // dst[63:0] := Convert_FP32_To_Int64(a[31:0])
436 //
437 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
439 {
440 #if defined(__aarch64__)
441 return vgetq_lane_s64(
442 vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0);
443 #else
444 float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
445 float32_t diff = data - floor(data);
446 if (diff > 0.5)
447 return (int64_t) ceil(data);
448 if (diff == 0.5) {
449 int64_t f = (int64_t) floor(data);
450 int64_t c = (int64_t) ceil(data);
451 return c & 1 ? f : c;
452 }
453 return (int64_t) floor(data);
454 #endif
455 }
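// Worked example of the round-to-nearest-even fallback above (illustrative):
//   2.5 -> floor 2, ceil 3, diff == 0.5, ceil is odd  -> returns 2
//   3.5 -> floor 3, ceil 4, diff == 0.5, ceil is even -> returns 4
//   2.6 -> diff 0.6 > 0.5                             -> returns 3 (ceil)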
456
457 // Convert packed single-precision (32-bit) floating-point elements in a to
458 // packed 32-bit integers with truncation, and store the results in dst.
459 //
460 // FOR j := 0 to 1
461 // i := 32*j
462 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
463 // ENDFOR
464 //
465 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
467 {
468 return vreinterpret_m64_s32(
469 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
470 }
471
472 // Convert the lower single-precision (32-bit) floating-point element in a to a
473 // 32-bit integer with truncation, and store the result in dst.
474 //
475 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
476 //
477 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
479 {
480 return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
481 }
482
483 // Convert packed single-precision (32-bit) floating-point elements in a to
484 // packed 32-bit integers with truncation, and store the results in dst.
485 //
486 // FOR j := 0 to 1
487 // i := 32*j
488 // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
489 // ENDFOR
490 //
491 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
492 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
493
494 // Convert the lower single-precision (32-bit) floating-point element in a to a
495 // 32-bit integer with truncation, and store the result in dst.
496 //
497 // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
498 //
499 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
500 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
501
502 // Convert the lower single-precision (32-bit) floating-point element in a to a
503 // 64-bit integer with truncation, and store the result in dst.
504 //
505 // dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
506 //
507 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
509 {
510 return vgetq_lane_s64(
511 vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0);
512 }
513
514 // Sets the 128-bit value to zero
515 // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
FORCE_INLINE __m128i _mm_setzero_si128(void)
517 {
518 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
519 }
520
521 // Clears the four single-precision, floating-point values.
522 // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setzero_ps(void)
524 {
525 return vreinterpretq_m128_f32(vdupq_n_f32(0));
526 }
527
528 // Return vector of type __m128d with all elements set to zero.
529 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
FORCE_INLINE __m128d _mm_setzero_pd(void)
531 {
532 #if defined(__aarch64__)
533 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
534 #else
535 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
536 #endif
537 }
538
539 // Sets the four single-precision, floating-point values to w.
540 //
541 // r0 := r1 := r2 := r3 := w
542 //
543 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set1_ps(float _w)
545 {
546 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
547 }
548
549 // Sets the four single-precision, floating-point values to w.
550 // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps1(float _w)
552 {
553 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
554 }
555
556 // Sets the four single-precision, floating-point values to the four inputs.
557 // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
559 {
560 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
561 return vreinterpretq_m128_f32(vld1q_f32(data));
562 }
563
564 // Copy single-precision (32-bit) floating-point element a to the lower element
565 // of dst, and zero the upper 3 elements.
566 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
FORCE_INLINE __m128 _mm_set_ss(float a)
568 {
569 float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
570 return vreinterpretq_m128_f32(vld1q_f32(data));
571 }
572
573 // Sets the four single-precision, floating-point values to the four inputs in
574 // reverse order.
575 // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
577 {
578 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
579 return vreinterpretq_m128_f32(vld1q_f32(data));
580 }
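// Usage sketch (illustrative): note the argument-order difference between
// _mm_set_ps and _mm_setr_ps. Both calls below build the same vector, whose
// lanes read {1, 2, 3, 4} from element 0 upward:
//
//   __m128 v1 = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // highest lane first
//   __m128 v2 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); // memory (lane) order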
581
582 // Sets the 8 signed 16-bit integer values in reverse order.
583 //
584 // Return Value
585 // r0 := w0
586 // r1 := w1
587 // ...
588 // r7 := w7
FORCE_INLINE __m128i _mm_setr_epi16(short w0,
590 short w1,
591 short w2,
592 short w3,
593 short w4,
594 short w5,
595 short w6,
596 short w7)
597 {
598 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
599 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
600 }
601
602 // Sets the 4 signed 32-bit integer values in reverse order
603 // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
605 {
606 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
607 return vreinterpretq_m128i_s32(vld1q_s32(data));
608 }
609
610 // Set packed 64-bit integers in dst with the supplied values in reverse order.
611 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
613 {
614 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
615 }
616
617 // Sets the 16 signed 8-bit integer values to b.
618 //
619 // r0 := b
620 // r1 := b
621 // ...
622 // r15 := b
623 //
624 // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
626 {
627 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
628 }
629
630 // Sets the 8 signed 16-bit integer values to w.
631 //
632 // r0 := w
633 // r1 := w
634 // ...
635 // r7 := w
636 //
637 // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set1_epi16(short w)
639 {
640 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
641 }
642
643 // Sets the 16 signed 8-bit integer values.
644 // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
646 signed char b14,
647 signed char b13,
648 signed char b12,
649 signed char b11,
650 signed char b10,
651 signed char b9,
652 signed char b8,
653 signed char b7,
654 signed char b6,
655 signed char b5,
656 signed char b4,
657 signed char b3,
658 signed char b2,
659 signed char b1,
660 signed char b0)
661 {
662 int8_t ALIGN_STRUCT(16)
663 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
664 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
665 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
666 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
667 return (__m128i) vld1q_s8(data);
668 }
669
670 // Sets the 8 signed 16-bit integer values.
671 // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
FORCE_INLINE __m128i _mm_set_epi16(short i7,
673 short i6,
674 short i5,
675 short i4,
676 short i3,
677 short i2,
678 short i1,
679 short i0)
680 {
681 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
682 return vreinterpretq_m128i_s16(vld1q_s16(data));
683 }
684
685 // Sets the 16 signed 8-bit integer values in reverse order.
686 // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
688 signed char b1,
689 signed char b2,
690 signed char b3,
691 signed char b4,
692 signed char b5,
693 signed char b6,
694 signed char b7,
695 signed char b8,
696 signed char b9,
697 signed char b10,
698 signed char b11,
699 signed char b12,
700 signed char b13,
701 signed char b14,
702 signed char b15)
703 {
704 int8_t ALIGN_STRUCT(16)
705 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
706 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
707 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
708 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
709 return (__m128i) vld1q_s8(data);
710 }
711
712 // Sets the 4 signed 32-bit integer values to i.
713 //
714 // r0 := i
715 // r1 := i
716 // r2 := i
// r3 := i
718 //
719 // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set1_epi32(int _i)
721 {
722 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
723 }
724
725 // Sets the 2 signed 64-bit integer values to i.
726 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
728 {
729 return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
730 }
731
732 // Sets the 2 signed 64-bit integer values to i.
733 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
735 {
736 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
737 }
738
739 // Sets the 4 signed 32-bit integer values.
740 // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
742 {
743 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
744 return vreinterpretq_m128i_s32(vld1q_s32(data));
745 }
746
747 // Returns the __m128i structure with its two 64-bit integer values
748 // initialized to the values of the two 64-bit integers passed in.
749 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
751 {
752 int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
753 return vreinterpretq_m128i_s64(vld1q_s64(data));
754 }
755
756 // Returns the __m128i structure with its two 64-bit integer values
757 // initialized to the values of the two 64-bit integers passed in.
758 // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
760 {
761 return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
762 }
763
764 // Set packed double-precision (64-bit) floating-point elements in dst with the
765 // supplied values.
766 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
768 {
769 double ALIGN_STRUCT(16) data[2] = {e0, e1};
770 #if defined(__aarch64__)
771 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
772 #else
773 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
774 #endif
775 }
776
777 // Stores four single-precision, floating-point values.
778 // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
780 {
781 vst1q_f32(p, vreinterpretq_f32_m128(a));
782 }
783
784 // Stores four single-precision, floating-point values.
785 // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
787 {
788 vst1q_f32(p, vreinterpretq_f32_m128(a));
789 }
790
// Stores four 32-bit integer values (as a __m128i value) at the address p.
792 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
794 {
795 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
796 }
797
// Stores four 32-bit integer values (as a __m128i value) at the address p.
799 // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
801 {
802 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
803 }
804
// Stores the lower single-precision, floating-point value.
806 // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
808 {
809 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
810 }
811
812 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
813 // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
814 // or a general-protection exception may be generated.
815 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
817 {
818 #if defined(__aarch64__)
819 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
820 #else
821 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
822 #endif
823 }
824
825 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
826 // elements) from a into memory. mem_addr does not need to be aligned on any
827 // particular boundary.
828 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
830 {
831 _mm_store_pd(mem_addr, a);
832 }
833
834 // Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
835 // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
837 {
838 uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
839 uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
840 *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
841 }
842
843 // Stores the lower two single-precision floating point values of a to the
844 // address p.
845 //
846 // *p0 := a0
847 // *p1 := a1
848 //
849 // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
851 {
852 *p = vreinterpret_m64_f32(vget_low_f32(a));
853 }
854
855 // Stores the upper two single-precision, floating-point values of a to the
856 // address p.
857 //
858 // *p0 := a2
859 // *p1 := a3
860 //
861 // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
863 {
864 *p = vreinterpret_m64_f32(vget_high_f32(a));
865 }
866
867 // Loads a single single-precision, floating-point value, copying it into all
868 // four words
869 // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
871 {
872 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
873 }
874
875 // Load a single-precision (32-bit) floating-point element from memory into all
876 // elements of dst.
877 //
878 // dst[31:0] := MEM[mem_addr+31:mem_addr]
879 // dst[63:32] := MEM[mem_addr+31:mem_addr]
880 // dst[95:64] := MEM[mem_addr+31:mem_addr]
881 // dst[127:96] := MEM[mem_addr+31:mem_addr]
882 //
883 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
884 #define _mm_load_ps1 _mm_load1_ps
885
886 // Sets the lower two single-precision, floating-point values with 64
887 // bits of data loaded from the address p; the upper two values are passed
888 // through from a.
889 //
890 // Return Value
891 // r0 := *p0
892 // r1 := *p1
893 // r2 := a2
894 // r3 := a3
895 //
896 // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
898 {
899 return vreinterpretq_m128_f32(
900 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
901 }
902
903 // Load 4 single-precision (32-bit) floating-point elements from memory into dst
904 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
905 // general-protection exception may be generated.
906 //
907 // dst[31:0] := MEM[mem_addr+127:mem_addr+96]
908 // dst[63:32] := MEM[mem_addr+95:mem_addr+64]
909 // dst[95:64] := MEM[mem_addr+63:mem_addr+32]
910 // dst[127:96] := MEM[mem_addr+31:mem_addr]
911 //
912 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
914 {
915 float32x4_t v = vrev64q_f32(vld1q_f32(p));
916 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
917 }
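// How the reversal above works (illustrative): for memory {x, y, z, w},
// vld1q_f32 yields lanes {x, y, z, w}, vrev64q_f32 swaps within each 64-bit
// half to give {y, x, w, z}, and vextq_f32(v, v, 2) rotates by two lanes to
// {w, z, y, x}, which is the reversed order _mm_loadr_ps requires.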
918
919 // Sets the upper two single-precision, floating-point values with 64
920 // bits of data loaded from the address p; the lower two values are passed
921 // through from a.
922 //
923 // r0 := a0
924 // r1 := a1
925 // r2 := *p0
926 // r3 := *p1
927 //
928 // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
930 {
931 return vreinterpretq_m128_f32(
932 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
933 }
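// Usage sketch (illustrative; row0/row1 are hypothetical pointers): loading
// four floats from two separate 64-bit locations, e.g. two rows of a 2x2
// matrix stored apart in memory:
//
//   __m128 v = _mm_setzero_ps();
//   v = _mm_loadl_pi(v, (const __m64 *) row0); // lanes 0-1 <- row0[0..1]
//   v = _mm_loadh_pi(v, (const __m64 *) row1); // lanes 2-3 <- row1[0..1]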
934
935 // Loads four single-precision, floating-point values.
936 // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
FORCE_INLINE __m128 _mm_load_ps(const float *p)
938 {
939 return vreinterpretq_m128_f32(vld1q_f32(p));
940 }
941
942 // Loads four single-precision, floating-point values.
943 // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
945 {
// For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
// equivalent.
948 return vreinterpretq_m128_f32(vld1q_f32(p));
949 }
950
951 // Load unaligned 16-bit integer from memory into the first element of dst.
952 //
953 // dst[15:0] := MEM[mem_addr+15:mem_addr]
954 // dst[MAX:16] := 0
955 //
956 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
958 {
959 return vreinterpretq_m128i_s16(
960 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
961 }
962
963 // Load unaligned 64-bit integer from memory into the first element of dst.
964 //
965 // dst[63:0] := MEM[mem_addr+63:mem_addr]
966 // dst[MAX:64] := 0
967 //
968 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
970 {
971 return vreinterpretq_m128i_s64(
972 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
973 }
974
975 // Load a double-precision (64-bit) floating-point element from memory into the
976 // lower of dst, and zero the upper element. mem_addr does not need to be
977 // aligned on any particular boundary.
978 //
979 // dst[63:0] := MEM[mem_addr+63:mem_addr]
980 // dst[127:64] := 0
981 //
982 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
FORCE_INLINE __m128d _mm_load_sd(const double *p)
984 {
985 #if defined(__aarch64__)
986 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
987 #else
988 const float *fp = (const float *) p;
989 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
990 return vreinterpretq_m128d_f32(vld1q_f32(data));
991 #endif
992 }
993
// Loads two double-precision, floating-point values from 16-byte aligned
// memory.
996 //
997 // dst[127:0] := MEM[mem_addr+127:mem_addr]
998 //
999 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
FORCE_INLINE __m128d _mm_load_pd(const double *p)
1001 {
1002 #if defined(__aarch64__)
1003 return vreinterpretq_m128d_f64(vld1q_f64(p));
1004 #else
1005 const float *fp = (const float *) p;
1006 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
1007 return vreinterpretq_m128d_f32(vld1q_f32(data));
1008 #endif
1009 }
1010
// Loads two double-precision, floating-point values from unaligned memory.
1012 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
1014 {
1015 return _mm_load_pd(p);
1016 }
1017
// Loads a single-precision, floating-point value into the low word and
// clears the upper three words.
1020 // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
FORCE_INLINE __m128 _mm_load_ss(const float *p)
1022 {
1023 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1024 }
1025
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
1027 {
1028 /* Load the lower 64 bits of the value pointed to by p into the
1029 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
1030 */
1031 return vreinterpretq_m128i_s32(
1032 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
1033 }
1034
1035 // Load a double-precision (64-bit) floating-point element from memory into the
1036 // lower element of dst, and copy the upper element from a to dst. mem_addr does
1037 // not need to be aligned on any particular boundary.
1038 //
1039 // dst[63:0] := MEM[mem_addr+63:mem_addr]
1040 // dst[127:64] := a[127:64]
1041 //
1042 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
1044 {
1045 #if defined(__aarch64__)
1046 return vreinterpretq_m128d_f64(
1047 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
1048 #else
1049 return vreinterpretq_m128d_f32(
1050 vcombine_f32(vld1_f32((const float *) p),
1051 vget_high_f32(vreinterpretq_f32_m128d(a))));
1052 #endif
1053 }
1054
1055 // Load 2 double-precision (64-bit) floating-point elements from memory into dst
1056 // in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1057 // general-protection exception may be generated.
1058 //
1059 // dst[63:0] := MEM[mem_addr+127:mem_addr+64]
1060 // dst[127:64] := MEM[mem_addr+63:mem_addr]
1061 //
1062 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
1064 {
1065 #if defined(__aarch64__)
1066 float64x2_t v = vld1q_f64(p);
1067 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
1068 #else
1069 int64x2_t v = vld1q_s64((const int64_t *) p);
1070 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
1071 #endif
1072 }
1073
1074 // Sets the low word to the single-precision, floating-point value of b
1075 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
1077 {
1078 return vreinterpretq_m128_f32(
1079 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
1080 vreinterpretq_f32_m128(a), 0));
1081 }
1082
1083 // Copy the lower 64-bit integer in a to the lower element of dst, and zero the
1084 // upper element.
1085 //
1086 // dst[63:0] := a[63:0]
1087 // dst[127:64] := 0
1088 //
1089 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
1091 {
1092 return vreinterpretq_m128i_s64(
1093 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
1094 }
1095
1096 // Return vector of type __m128 with undefined elements.
1097 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
1099 {
1100 __m128 a;
1101 return a;
1102 }
1103
1104 /* Logic/Binary operations */
1105
1106 // Computes the bitwise AND-NOT of the four single-precision, floating-point
1107 // values of a and b.
1108 //
1109 // r0 := ~a0 & b0
1110 // r1 := ~a1 & b1
1111 // r2 := ~a2 & b2
1112 // r3 := ~a3 & b3
1113 //
1114 // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
1116 {
1117 return vreinterpretq_m128_s32(
1118 vbicq_s32(vreinterpretq_s32_m128(b),
1119 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1120 }
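// Note on the argument swap above (and in the other andnot variants below):
// NEON's vbicq_s32(x, y) computes x & ~y, i.e. the complement applies to the
// second operand, whereas SSE's andnot complements the first operand. Passing
// (b, a) therefore yields ~a & b as required. For example, with a = 0b1100 and
// b = 0b1010 in a lane, the result is 0b0010.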
1121
1122 // Compute the bitwise NOT of packed double-precision (64-bit) floating-point
1123 // elements in a and then AND with b, and store the results in dst.
1124 //
1125 // FOR j := 0 to 1
1126 // i := j*64
1127 // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
1128 // ENDFOR
1129 //
1130 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
1132 {
1133 // *NOTE* argument swap
1134 return vreinterpretq_m128d_s64(
1135 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
1136 }
1137
1138 // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
1139 // 128-bit value in a.
1140 //
1141 // r := (~a) & b
1142 //
1143 // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
1145 {
1146 return vreinterpretq_m128i_s32(
1147 vbicq_s32(vreinterpretq_s32_m128i(b),
1148 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
1149 }
1150
1151 // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
1152 // b.
1153 //
1154 // r := a & b
1155 //
1156 // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
1158 {
1159 return vreinterpretq_m128i_s32(
1160 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1161 }
1162
1163 // Computes the bitwise AND of the four single-precision, floating-point values
1164 // of a and b.
1165 //
1166 // r0 := a0 & b0
1167 // r1 := a1 & b1
1168 // r2 := a2 & b2
1169 // r3 := a3 & b3
1170 //
1171 // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1173 {
1174 return vreinterpretq_m128_s32(
1175 vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1176 }
1177
1178 // Compute the bitwise AND of packed double-precision (64-bit) floating-point
1179 // elements in a and b, and store the results in dst.
1180 //
1181 // FOR j := 0 to 1
1182 // i := j*64
1183 // dst[i+63:i] := a[i+63:i] AND b[i+63:i]
1184 // ENDFOR
1185 //
1186 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
1188 {
1189 return vreinterpretq_m128d_s64(
1190 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1191 }
1192
1193 // Computes the bitwise OR of the four single-precision, floating-point values
1194 // of a and b.
1195 // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
1197 {
1198 return vreinterpretq_m128_s32(
1199 vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1200 }
1201
1202 // Computes bitwise EXOR (exclusive-or) of the four single-precision,
1203 // floating-point values of a and b.
1204 // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
1206 {
1207 return vreinterpretq_m128_s32(
1208 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
1209 }
1210
1211 // Compute the bitwise XOR of packed double-precision (64-bit) floating-point
1212 // elements in a and b, and store the results in dst.
1213 //
1214 // FOR j := 0 to 1
1215 // i := j*64
1216 // dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
1217 // ENDFOR
1218 //
1219 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
1221 {
1222 return vreinterpretq_m128d_s64(
1223 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
1224 }
1225
1226 // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
1227 //
1228 // r := a | b
1229 //
1230 // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
1232 {
1233 return vreinterpretq_m128i_s32(
1234 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1235 }
1236
1237 // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
1238 // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
1240 {
1241 return vreinterpretq_m128i_s32(
1242 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
1243 }
1244
1245 // Duplicate odd-indexed single-precision (32-bit) floating-point elements
1246 // from a, and store the results in dst.
1247 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
1249 {
1250 #if __has_builtin(__builtin_shufflevector)
1251 return vreinterpretq_m128_f32(__builtin_shufflevector(
1252 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
1253 #else
1254 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
1255 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
1256 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
1257 return vreinterpretq_m128_f32(vld1q_f32(data));
1258 #endif
1259 }
1260
1261 // Duplicate even-indexed single-precision (32-bit) floating-point elements
1262 // from a, and store the results in dst.
1263 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
1265 {
1266 #if __has_builtin(__builtin_shufflevector)
1267 return vreinterpretq_m128_f32(__builtin_shufflevector(
1268 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
1269 #else
1270 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1271 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
1272 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
1273 return vreinterpretq_m128_f32(vld1q_f32(data));
1274 #endif
1275 }
1276
1277 // Moves the upper two values of B into the lower two values of A.
1278 //
1279 // r3 := a3
1280 // r2 := a2
1281 // r1 := b3
1282 // r0 := b2
FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
1284 {
1285 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
1286 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
1287 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
1288 }
1289
1290 // Moves the lower two values of B into the upper two values of A.
1291 //
1292 // r3 := b1
1293 // r2 := b0
1294 // r1 := a1
1295 // r0 := a0
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
1297 {
1298 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
1299 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
1300 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1301 }
1302
1303 // Compute the absolute value of packed signed 32-bit integers in a, and store
1304 // the unsigned results in dst.
1305 //
1306 // FOR j := 0 to 3
1307 // i := j*32
1308 // dst[i+31:i] := ABS(a[i+31:i])
1309 // ENDFOR
1310 //
1311 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
1313 {
1314 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
1315 }
1316
1317 // Compute the absolute value of packed signed 16-bit integers in a, and store
1318 // the unsigned results in dst.
1319 //
1320 // FOR j := 0 to 7
1321 // i := j*16
1322 // dst[i+15:i] := ABS(a[i+15:i])
1323 // ENDFOR
1324 //
1325 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
1327 {
1328 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
1329 }
1330
1331 // Compute the absolute value of packed signed 8-bit integers in a, and store
1332 // the unsigned results in dst.
1333 //
1334 // FOR j := 0 to 15
1335 // i := j*8
1336 // dst[i+7:i] := ABS(a[i+7:i])
1337 // ENDFOR
1338 //
1339 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
1341 {
1342 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
1343 }
1344
1345 // Compute the absolute value of packed signed 32-bit integers in a, and store
1346 // the unsigned results in dst.
1347 //
1348 // FOR j := 0 to 1
1349 // i := j*32
1350 // dst[i+31:i] := ABS(a[i+31:i])
1351 // ENDFOR
1352 //
1353 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
1355 {
1356 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
1357 }
1358
1359 // Compute the absolute value of packed signed 16-bit integers in a, and store
1360 // the unsigned results in dst.
1361 //
1362 // FOR j := 0 to 3
1363 // i := j*16
1364 // dst[i+15:i] := ABS(a[i+15:i])
1365 // ENDFOR
1366 //
1367 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
1369 {
1370 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
1371 }
1372
1373 // Compute the absolute value of packed signed 8-bit integers in a, and store
1374 // the unsigned results in dst.
1375 //
1376 // FOR j := 0 to 7
1377 // i := j*8
1378 // dst[i+7:i] := ABS(a[i+7:i])
1379 // ENDFOR
1380 //
1381 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
1383 {
1384 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
1385 }
1386
// Takes the upper 64 bits of a and places them in the low end of the result;
// takes the lower 64 bits of b and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
1390 {
1391 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1392 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1393 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
1394 }
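// The digits in the _mm_shuffle_ps_* helper names encode the _MM_SHUFFLE
// immediate they implement, listed from the highest result lane down
// (explanatory note). For example, _mm_shuffle_ps_1032 corresponds to
// _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)) and produces
// {a2, a3, b0, b1} in lanes 0 through 3.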
1395
// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the higher two 32-bit values from b, swaps
// them, and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
1400 {
1401 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1402 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
1403 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
1404 }
1405
1406 FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
1407 {
1408 float32x2_t a21 = vget_high_f32(
1409 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1410 float32x2_t b03 = vget_low_f32(
1411 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1412 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
1413 }
1414
1415 FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
1416 {
1417 float32x2_t a03 = vget_low_f32(
1418 vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
1419 float32x2_t b21 = vget_high_f32(
1420 vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
1421 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
1422 }
1423
1424 FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
1425 {
1426 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1427 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1428 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
1429 }
1430
1431 FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
1432 {
1433 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1434 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
1435 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
1436 }
1437
1438 FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
1439 {
1440 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1441 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
1442 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
1443 }
1444
1445 // keeps the low 64 bits of a in the low end of the result and puts the high
1446 // 64 bits of b in the high end
1447 FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
1448 {
1449 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1450 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1451 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
1452 }
1453
1454 FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
1455 {
1456 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
1457 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1458 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
1459 }
1460
1461 FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
1462 {
1463 float32x2_t a22 =
1464 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1465 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1466 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
1467 }
1468
1469 FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
1470 {
1471 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
1472 float32x2_t b22 =
1473 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
1474 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
1475 }
1476
1477 FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
1478 {
1479 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1480 float32x2_t a22 =
1481 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
1482 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
1483 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
1484 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
1485 }
1486
1487 FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
1488 {
1489 float32x2_t a33 =
1490 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
1491 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
1492 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
1493 }
1494
1495 FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
1496 {
1497 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
1498 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1499 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1500 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1501 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
1502 }
1503
1504 FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
1505 {
1506 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
1507 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1508 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1509 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1510 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
1511 }
1512
1513 FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
1514 {
1515 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
1516 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
1517 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
1518 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
1519 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
1520 }
1521
1522 // NEON does not support a general purpose permute intrinsic
1523 // Selects four specific single-precision, floating-point values from a and b,
1524 // based on the mask i.
1525 //
1526 // C equivalent:
1527 // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1528 // __constrange(0, 255) int imm) {
1529 // __m128 ret;
1530 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
1531 // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
1532 // return ret;
1533 // }
1534 //
1535 // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
1536 #define _mm_shuffle_ps_default(a, b, imm) \
1537 __extension__({ \
1538 float32x4_t ret; \
1539 ret = vmovq_n_f32( \
1540 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
1541 ret = vsetq_lane_f32( \
1542 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1543 ret, 1); \
1544 ret = vsetq_lane_f32( \
1545 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1546 ret, 2); \
1547 ret = vsetq_lane_f32( \
1548 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1549 ret, 3); \
1550 vreinterpretq_m128_f32(ret); \
1551 })
1552
1553 // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
1554 // int imm)
1555 #if __has_builtin(__builtin_shufflevector)
1556 #define _mm_shuffle_ps(a, b, imm) \
1557 __extension__({ \
1558 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
1559 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
1560 float32x4_t _shuf = __builtin_shufflevector( \
1561 _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1562 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
1563 vreinterpretq_m128_f32(_shuf); \
1564 })
1565 #else // generic
1566 #define _mm_shuffle_ps(a, b, imm) \
1567 __extension__({ \
1568 __m128 ret; \
1569 switch (imm) { \
1570 case _MM_SHUFFLE(1, 0, 3, 2): \
1571 ret = _mm_shuffle_ps_1032((a), (b)); \
1572 break; \
1573 case _MM_SHUFFLE(2, 3, 0, 1): \
1574 ret = _mm_shuffle_ps_2301((a), (b)); \
1575 break; \
1576 case _MM_SHUFFLE(0, 3, 2, 1): \
1577 ret = _mm_shuffle_ps_0321((a), (b)); \
1578 break; \
1579 case _MM_SHUFFLE(2, 1, 0, 3): \
1580 ret = _mm_shuffle_ps_2103((a), (b)); \
1581 break; \
1582 case _MM_SHUFFLE(1, 0, 1, 0): \
1583 ret = _mm_movelh_ps((a), (b)); \
1584 break; \
1585 case _MM_SHUFFLE(1, 0, 0, 1): \
1586 ret = _mm_shuffle_ps_1001((a), (b)); \
1587 break; \
1588 case _MM_SHUFFLE(0, 1, 0, 1): \
1589 ret = _mm_shuffle_ps_0101((a), (b)); \
1590 break; \
1591 case _MM_SHUFFLE(3, 2, 1, 0): \
1592 ret = _mm_shuffle_ps_3210((a), (b)); \
1593 break; \
1594 case _MM_SHUFFLE(0, 0, 1, 1): \
1595 ret = _mm_shuffle_ps_0011((a), (b)); \
1596 break; \
1597 case _MM_SHUFFLE(0, 0, 2, 2): \
1598 ret = _mm_shuffle_ps_0022((a), (b)); \
1599 break; \
1600 case _MM_SHUFFLE(2, 2, 0, 0): \
1601 ret = _mm_shuffle_ps_2200((a), (b)); \
1602 break; \
1603 case _MM_SHUFFLE(3, 2, 0, 2): \
1604 ret = _mm_shuffle_ps_3202((a), (b)); \
1605 break; \
1606 case _MM_SHUFFLE(3, 2, 3, 2): \
1607 ret = _mm_movehl_ps((b), (a)); \
1608 break; \
1609 case _MM_SHUFFLE(1, 1, 3, 3): \
1610 ret = _mm_shuffle_ps_1133((a), (b)); \
1611 break; \
1612 case _MM_SHUFFLE(2, 0, 1, 0): \
1613 ret = _mm_shuffle_ps_2010((a), (b)); \
1614 break; \
1615 case _MM_SHUFFLE(2, 0, 0, 1): \
1616 ret = _mm_shuffle_ps_2001((a), (b)); \
1617 break; \
1618 case _MM_SHUFFLE(2, 0, 3, 2): \
1619 ret = _mm_shuffle_ps_2032((a), (b)); \
1620 break; \
1621 default: \
1622 ret = _mm_shuffle_ps_default((a), (b), (imm)); \
1623 break; \
1624 } \
1625 ret; \
1626 })
1627 #endif
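
// Illustrative usage sketch (not part of the translation layer; _mm_setr_ps is
// assumed to be provided elsewhere in this header). The two low result lanes
// come from a and the two high lanes from b:
//   __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
//   __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 2, 0));
//   // r = {a[0], a[2], b[1], b[3]} = {0.0f, 2.0f, 5.0f, 7.0f}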
1628
1629 // Takes the upper 64 bits of a and places it in the low end of the result
1630 // Takes the lower 64 bits of a and places it into the high end of the result.
1631 FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
1632 {
1633 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1634 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1635 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1636 }
1637
1638 // takes the lower two 32-bit values from a and swaps them and places in low end
1639 // of result takes the higher two 32 bit values from a and swaps them and places
1640 // in high end of result.
1641 FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
1642 {
1643 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1644 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1645 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1646 }
1647
1648 // rotates the least significant 32 bits into the most significant 32 bits, and
1649 // shifts the rest down
1650 FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
1651 {
1652 return vreinterpretq_m128i_s32(
1653 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
1654 }
1655
1656 // rotates the most significant 32 bits into the least significant 32 bits, and
1657 // shifts the rest up
1658 FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
1659 {
1660 return vreinterpretq_m128i_s32(
1661 vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
1662 }
1663
1664 // gets the lower 64 bits of a, and places it in the upper 64 bits
1665 // gets the lower 64 bits of a and places it in the lower 64 bits
1666 FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
1667 {
1668 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1669 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1670 }
1671
1672 // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
1673 // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
1674 FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
1675 {
1676 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1677 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1678 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1679 }
1680
1681 // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
1682 // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
1683 // places it in the lower 64 bits
1684 FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
1685 {
1686 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1687 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1688 }
1689
1690 FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
1691 {
1692 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1693 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1694 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1695 }
1696
1697 FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
1698 {
1699 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1700 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1701 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1702 }
1703
1704 FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
1705 {
1706 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1707 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1708 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1709 }
1710
1711 // Shuffle packed 8-bit integers in a according to shuffle control mask in the
1712 // corresponding 8-bit element of b, and store the results in dst.
1713 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
1714 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
1715 {
1716 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
1717 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
1718 uint8x16_t idx_masked =
1719 vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
1720 #if defined(__aarch64__)
1721 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
1722 #elif defined(__GNUC__)
1723 int8x16_t ret;
1724 // %e and %f represent the even and odd D registers
1725 // respectively.
1726 __asm__ __volatile__(
1727 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
1728 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
1729 : [ret] "=&w"(ret)
1730 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
1731 return vreinterpretq_m128i_s8(ret);
1732 #else
1733 // generic fallback: emulate the 16-byte table lookup with two vtbl2 lookups
1734 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
1735 return vreinterpretq_m128i_s8(
1736 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
1737 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
1738 #endif
1739 }
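
// Illustrative usage sketch (not part of the translation layer; _mm_setr_epi8 is
// assumed to be provided elsewhere in this header). A control byte with its most
// significant bit set zeroes the corresponding result byte, which is why the
// index mask above keeps bit 7. Reversing the 16 bytes of some __m128i a:
//   __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
//                               7, 6, 5, 4, 3, 2, 1, 0);
//   __m128i r   = _mm_shuffle_epi8(a, rev); // r byte i == a byte (15 - i)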
1740
1741 // C equivalent:
1742 // __m128i _mm_shuffle_epi32_default(__m128i a,
1743 // __constrange(0, 255) int imm) {
1744 // __m128i ret;
1745 // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
1746 // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
1747 // return ret;
1748 // }
1749 #define _mm_shuffle_epi32_default(a, imm) \
1750 __extension__({ \
1751 int32x4_t ret; \
1752 ret = vmovq_n_s32( \
1753 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
1754 ret = vsetq_lane_s32( \
1755 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
1756 ret, 1); \
1757 ret = vsetq_lane_s32( \
1758 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
1759 ret, 2); \
1760 ret = vsetq_lane_s32( \
1761 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
1762 ret, 3); \
1763 vreinterpretq_m128i_s32(ret); \
1764 })
1765
1766 // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
1767 // int imm)
1768 #if defined(__aarch64__)
1769 #define _mm_shuffle_epi32_splat(a, imm) \
1770 __extension__({ \
1771 vreinterpretq_m128i_s32( \
1772 vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
1773 })
1774 #else
1775 #define _mm_shuffle_epi32_splat(a, imm) \
1776 __extension__({ \
1777 vreinterpretq_m128i_s32( \
1778 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
1779 })
1780 #endif
1781
1782 // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
1783 // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
1784 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
1785 // __constrange(0,255) int imm)
1786 #if __has_builtin(__builtin_shufflevector)
1787 #define _mm_shuffle_epi32(a, imm) \
1788 __extension__({ \
1789 int32x4_t _input = vreinterpretq_s32_m128i(a); \
1790 int32x4_t _shuf = __builtin_shufflevector( \
1791 _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1792 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
1793 vreinterpretq_m128i_s32(_shuf); \
1794 })
1795 #else // generic
1796 #define _mm_shuffle_epi32(a, imm) \
1797 __extension__({ \
1798 __m128i ret; \
1799 switch (imm) { \
1800 case _MM_SHUFFLE(1, 0, 3, 2): \
1801 ret = _mm_shuffle_epi_1032((a)); \
1802 break; \
1803 case _MM_SHUFFLE(2, 3, 0, 1): \
1804 ret = _mm_shuffle_epi_2301((a)); \
1805 break; \
1806 case _MM_SHUFFLE(0, 3, 2, 1): \
1807 ret = _mm_shuffle_epi_0321((a)); \
1808 break; \
1809 case _MM_SHUFFLE(2, 1, 0, 3): \
1810 ret = _mm_shuffle_epi_2103((a)); \
1811 break; \
1812 case _MM_SHUFFLE(1, 0, 1, 0): \
1813 ret = _mm_shuffle_epi_1010((a)); \
1814 break; \
1815 case _MM_SHUFFLE(1, 0, 0, 1): \
1816 ret = _mm_shuffle_epi_1001((a)); \
1817 break; \
1818 case _MM_SHUFFLE(0, 1, 0, 1): \
1819 ret = _mm_shuffle_epi_0101((a)); \
1820 break; \
1821 case _MM_SHUFFLE(2, 2, 1, 1): \
1822 ret = _mm_shuffle_epi_2211((a)); \
1823 break; \
1824 case _MM_SHUFFLE(0, 1, 2, 2): \
1825 ret = _mm_shuffle_epi_0122((a)); \
1826 break; \
1827 case _MM_SHUFFLE(3, 3, 3, 2): \
1828 ret = _mm_shuffle_epi_3332((a)); \
1829 break; \
1830 case _MM_SHUFFLE(0, 0, 0, 0): \
1831 ret = _mm_shuffle_epi32_splat((a), 0); \
1832 break; \
1833 case _MM_SHUFFLE(1, 1, 1, 1): \
1834 ret = _mm_shuffle_epi32_splat((a), 1); \
1835 break; \
1836 case _MM_SHUFFLE(2, 2, 2, 2): \
1837 ret = _mm_shuffle_epi32_splat((a), 2); \
1838 break; \
1839 case _MM_SHUFFLE(3, 3, 3, 3): \
1840 ret = _mm_shuffle_epi32_splat((a), 3); \
1841 break; \
1842 default: \
1843 ret = _mm_shuffle_epi32_default((a), (imm)); \
1844 break; \
1845 } \
1846 ret; \
1847 })
1848 #endif
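
// Illustrative usage sketch (not part of the translation layer; _mm_setr_epi32 is
// assumed to be provided elsewhere in this header):
//   __m128i v     = _mm_setr_epi32(10, 20, 30, 40);
//   __m128i splat = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0)); // {10,10,10,10}
//   __m128i rev   = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // {40,30,20,10}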
1849
1850 // Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
1851 // by imm.
1852 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
1853 // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
1854 // __constrange(0,255) int
1855 // imm)
1856 #define _mm_shufflelo_epi16_function(a, imm) \
1857 __extension__({ \
1858 int16x8_t ret = vreinterpretq_s16_m128i(a); \
1859 int16x4_t lowBits = vget_low_s16(ret); \
1860 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
1861 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1862 1); \
1863 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1864 2); \
1865 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1866 3); \
1867 vreinterpretq_m128i_s16(ret); \
1868 })
1869
1870 // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
1871 // __constrange(0,255) int imm)
1872 #if __has_builtin(__builtin_shufflevector)
1873 #define _mm_shufflelo_epi16(a, imm) \
1874 __extension__({ \
1875 int16x8_t _input = vreinterpretq_s16_m128i(a); \
1876 int16x8_t _shuf = __builtin_shufflevector( \
1877 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
1878 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
1879 vreinterpretq_m128i_s16(_shuf); \
1880 })
1881 #else // generic
1882 #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
1883 #endif
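
// Illustrative usage sketch (not part of the translation layer; _mm_setr_epi16 is
// assumed to be provided elsewhere in this header). Only the low four 16-bit
// lanes are permuted; the high four are copied through unchanged:
//   __m128i v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
//   __m128i r = _mm_shufflelo_epi16(v, _MM_SHUFFLE(0, 1, 2, 3));
//   // r = {3, 2, 1, 0, 4, 5, 6, 7}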
1884
1885 // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
1886 // by imm.
1887 // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
1888 // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
1889 // __constrange(0,255) int
1890 // imm)
1891 #define _mm_shufflehi_epi16_function(a, imm) \
1892 __extension__({ \
1893 int16x8_t ret = vreinterpretq_s16_m128i(a); \
1894 int16x4_t highBits = vget_high_s16(ret); \
1895 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1896 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1897 5); \
1898 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1899 6); \
1900 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1901 7); \
1902 vreinterpretq_m128i_s16(ret); \
1903 })
1904
1905 // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
1906 // __constrange(0,255) int imm)
1907 #if __has_builtin(__builtin_shufflevector)
1908 #define _mm_shufflehi_epi16(a, imm) \
1909 __extension__({ \
1910 int16x8_t _input = vreinterpretq_s16_m128i(a); \
1911 int16x8_t _shuf = __builtin_shufflevector( \
1912 _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
1913 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
1914 (((imm) >> 6) & 0x3) + 4); \
1915 vreinterpretq_m128i_s16(_shuf); \
1916 })
1917 #else // generic
1918 #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
1919 #endif
1920
1921 // Blend packed 16-bit integers from a and b using control mask imm8, and store
1922 // the results in dst.
1923 //
1924 // FOR j := 0 to 7
1925 // i := j*16
1926 // IF imm8[j]
1927 // dst[i+15:i] := b[i+15:i]
1928 // ELSE
1929 // dst[i+15:i] := a[i+15:i]
1930 // FI
1931 // ENDFOR
1932 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
1933 // __constrange(0,255) int imm)
1934 #define _mm_blend_epi16(a, b, imm) \
1935 __extension__({ \
1936 const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \
1937 ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \
1938 ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \
1939 ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \
1940 ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \
1941 ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \
1942 ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \
1943 ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
1944 uint16x8_t _mask_vec = vld1q_u16(_mask); \
1945 uint16x8_t _a = vreinterpretq_u16_m128i(a); \
1946 uint16x8_t _b = vreinterpretq_u16_m128i(b); \
1947 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
1948 })
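
// Illustrative usage sketch (not part of the translation layer): bit j of imm8
// selects 16-bit lane j from b, a cleared bit keeps the lane from a. For
// example, 0x0F takes the low four lanes from b and the high four from a:
//   __m128i r = _mm_blend_epi16(a, b, 0x0F);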
1949
1950 // Blend packed 8-bit integers from a and b using mask, and store the results in
1951 // dst.
1952 //
1953 // FOR j := 0 to 15
1954 // i := j*8
1955 // IF mask[i+7]
1956 // dst[i+7:i] := b[i+7:i]
1957 // ELSE
1958 // dst[i+7:i] := a[i+7:i]
1959 // FI
1960 // ENDFOR
1961 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
1962 {
1963 // Use a signed shift right to create a mask with the sign bit
1964 uint8x16_t mask =
1965 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
1966 uint8x16_t a = vreinterpretq_u8_m128i(_a);
1967 uint8x16_t b = vreinterpretq_u8_m128i(_b);
1968 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
1969 }
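
// Illustrative usage sketch (not part of the translation layer; _mm_cmpgt_epi8 is
// assumed to be provided elsewhere in this header). Only the most significant bit
// of each mask byte is consulted, so any comparison result works as a mask:
//   __m128i gt  = _mm_cmpgt_epi8(b, a);      // 0xFF where b > a (signed)
//   __m128i max = _mm_blendv_epi8(a, b, gt); // per-byte signed maximum of a and b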
1970
1971 /* Shifts */
1972
1973
1974 // Shift packed 16-bit integers in a right by imm while shifting in sign
1975 // bits, and store the results in dst.
1976 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
1977 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
1978 {
1979 const int count = (imm & ~15) ? 15 : imm;
1980 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
1981 }
1982
1983 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
1984 // shifting in zeros.
1985 //
1986 // r0 := a0 << count
1987 // r1 := a1 << count
1988 // ...
1989 // r7 := a7 << count
1990 //
1991 // https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
1992 #define _mm_slli_epi16(a, imm) \
1993 __extension__({ \
1994 __m128i ret; \
1995 if ((imm) <= 0) { \
1996 ret = a; \
1997 } else if ((imm) > 15) { \
1998 ret = _mm_setzero_si128(); \
1999 } else { \
2000 ret = vreinterpretq_m128i_s16( \
2001 vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
2002 } \
2003 ret; \
2004 })
2005
2006 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2007 // shifting in zeros.
2008 // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
2009 // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
2010 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
2011 {
2012 if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
2013 return a;
2014 if (imm > 31) /* TODO: add unlikely macro */
2015 return _mm_setzero_si128();
2016 return vreinterpretq_m128i_s32(
2017 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
2018 }
2019
2020 // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
2021 // store the results in dst.
2022 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
2023 {
2024 if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
2025 return a;
2026 if (imm > 63) /* TODO: add unlikely macro */
2027 return _mm_setzero_si128();
2028 return vreinterpretq_m128i_s64(
2029 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
2030 }
2031
2032 // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
2033 // store the results in dst.
2034 //
2035 // FOR j := 0 to 7
2036 // i := j*16
2037 // IF imm8[7:0] > 15
2038 // dst[i+15:i] := 0
2039 // ELSE
2040 // dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
2041 // FI
2042 // ENDFOR
2043 //
2044 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
2045 #define _mm_srli_epi16(a, imm) \
2046 __extension__({ \
2047 __m128i ret; \
2048 if ((imm) == 0) { \
2049 ret = a; \
2050 } else if (0 < (imm) && (imm) < 16) { \
2051 ret = vreinterpretq_m128i_u16( \
2052 vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
2053 } else { \
2054 ret = _mm_setzero_si128(); \
2055 } \
2056 ret; \
2057 })
2058
2059 // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
2060 // store the results in dst.
2061 //
2062 // FOR j := 0 to 3
2063 // i := j*32
2064 // IF imm8[7:0] > 31
2065 // dst[i+31:i] := 0
2066 // ELSE
2067 // dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
2068 // FI
2069 // ENDFOR
2070 //
2071 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
2072 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
2073 #define _mm_srli_epi32(a, imm) \
2074 __extension__({ \
2075 __m128i ret; \
2076 if ((imm) == 0) { \
2077 ret = a; \
2078 } else if (0 < (imm) && (imm) < 32) { \
2079 ret = vreinterpretq_m128i_u32( \
2080 vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
2081 } else { \
2082 ret = _mm_setzero_si128(); \
2083 } \
2084 ret; \
2085 })
2086
2087 // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
2088 // store the results in dst.
2089 //
2090 // FOR j := 0 to 1
2091 // i := j*64
2092 // IF imm8[7:0] > 63
2093 // dst[i+63:i] := 0
2094 // ELSE
2095 // dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
2096 // FI
2097 // ENDFOR
2098 //
2099 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
2100 #define _mm_srli_epi64(a, imm) \
2101 __extension__({ \
2102 __m128i ret; \
2103 if ((imm) == 0) { \
2104 ret = a; \
2105 } else if (0 < (imm) && (imm) < 64) { \
2106 ret = vreinterpretq_m128i_u64( \
2107 vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
2108 } else { \
2109 ret = _mm_setzero_si128(); \
2110 } \
2111 ret; \
2112 })
2113
2114 // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
2115 // and store the results in dst.
2116 //
2117 // FOR j := 0 to 3
2118 // i := j*32
2119 // IF imm8[7:0] > 31
2120 // dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
2121 // ELSE
2122 // dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
2123 // FI
2124 // ENDFOR
2125 //
2126 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
2127 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
2128 #define _mm_srai_epi32(a, imm) \
2129 __extension__({ \
2130 __m128i ret; \
2131 if ((imm) == 0) { \
2132 ret = a; \
2133 } else if (0 < (imm) && (imm) < 32) { \
2134 ret = vreinterpretq_m128i_s32( \
2135 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
2136 } else { \
2137 ret = vreinterpretq_m128i_s32( \
2138 vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
2139 } \
2140 ret; \
2141 })
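
// Illustrative example of the logical/arithmetic distinction (not part of the
// translation layer; _mm_set1_epi32 is assumed to be provided elsewhere in this
// header):
//   __m128i v = _mm_set1_epi32(-8);   // 0xFFFFFFF8 in every lane
//   __m128i l = _mm_srli_epi32(v, 1); // 0x7FFFFFFC: zeros shifted in
//   __m128i s = _mm_srai_epi32(v, 1); // 0xFFFFFFFC (-4): sign bits shifted in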
2142
2143 // Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
2144 // imm must be an immediate.
2145 //
2146 // r := srl(a, imm*8)
2147 //
2148 // https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
2149 // FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
2150 #define _mm_srli_si128(a, imm) \
2151 __extension__({ \
2152 __m128i ret; \
2153 if ((imm) <= 0) { \
2154 ret = a; \
2155 } else if ((imm) > 15) { \
2156 ret = _mm_setzero_si128(); \
2157 } else { \
2158 ret = vreinterpretq_m128i_s8( \
2159 vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
2160 } \
2161 ret; \
2162 })
2163
2164 // Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
2165 // must be an immediate.
2166 //
2167 // r := a << (imm * 8)
2168 //
2169 // https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
2170 // FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
2171 #define _mm_slli_si128(a, imm) \
2172 __extension__({ \
2173 __m128i ret; \
2174 if ((imm) <= 0) { \
2175 ret = a; \
2176 } else if ((imm) > 15) { \
2177 ret = _mm_setzero_si128(); \
2178 } else { \
2179 ret = vreinterpretq_m128i_s8(vextq_s8( \
2180 vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
2181 } \
2182 ret; \
2183 })
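
// Illustrative usage sketch (not part of the translation layer): these are
// whole-register byte shifts, not per-lane bit shifts. Shifting right by 4 bytes
// moves 32-bit lane 1 of some __m128i a into lane 0 and zero-fills the top lane:
//   __m128i r = _mm_srli_si128(a, 4); // r = {a1, a2, a3, 0} viewed as 32-bit lanes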
2184
2185 // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
2186 // shifting in zeros.
2187 //
2188 // r0 := a0 << count
2189 // r1 := a1 << count
2190 // ...
2191 // r7 := a7 << count
2192 //
2193 // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
2194 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
2195 {
2196 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2197 if (c > 15)
2198 return _mm_setzero_si128();
2199
2200 int16x8_t vc = vdupq_n_s16((int16_t) c);
2201 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
2202 }
2203
2204 // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
2205 // shifting in zeros.
2206 //
2207 // r0 := a0 << count
2208 // r1 := a1 << count
2209 // r2 := a2 << count
2210 // r3 := a3 << count
2211 //
2212 // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
2213 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
2214 {
2215 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2216 if (c > 31)
2217 return _mm_setzero_si128();
2218
2219 int32x4_t vc = vdupq_n_s32((int32_t) c);
2220 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
2221 }
2222
2223 // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
2224 // shifting in zeros.
2225 //
2226 // r0 := a0 << count
2227 // r1 := a1 << count
2228 //
2229 // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
2230 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
2231 {
2232 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2233 if (c > 63)
2234 return _mm_setzero_si128();
2235
2236 int64x2_t vc = vdupq_n_s64((int64_t) c);
2237 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
2238 }
2239
2240 // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
2241 // while shifting in zeros.
2242 //
2243 // r0 := srl(a0, count)
2244 // r1 := srl(a1, count)
2245 // ...
2246 // r7 := srl(a7, count)
2247 //
2248 // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
2249 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
2250 {
2251 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2252 if (c > 15)
2253 return _mm_setzero_si128();
2254
2255 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
2256 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
2257 }
2258
2259 // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
2260 // while shifting in zeros.
2261 //
2262 // r0 := srl(a0, count)
2263 // r1 := srl(a1, count)
2264 // r2 := srl(a2, count)
2265 // r3 := srl(a3, count)
2266 //
2267 // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
2268 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
2269 {
2270 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2271 if (c > 31)
2272 return _mm_setzero_si128();
2273
2274 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
2275 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
2276 }
2277
2278 // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
2279 // while shifting in zeros.
2280 //
2281 // r0 := srl(a0, count)
2282 // r1 := srl(a1, count)
2283 //
2284 // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
2285 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
2286 {
2287 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
2288 if (c > 63)
2289 return _mm_setzero_si128();
2290
2291 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
2292 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
2293 }
2294
2295 // NEON does not provide a version of this function.
2296 // Creates a 16-bit mask from the most significant bits of the 16 signed or
2297 // unsigned 8-bit integers in a and zero extends the upper bits.
2298 // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
2299 FORCE_INLINE int _mm_movemask_epi8(__m128i a)
2300 {
2301 #if defined(__aarch64__)
2302 uint8x16_t input = vreinterpretq_u8_m128i(a);
2303 const int8_t ALIGN_STRUCT(16)
2304 xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
2305 const uint8x16_t mask_and = vdupq_n_u8(0x80);
2306 const int8x16_t mask_shift = vld1q_s8(xr);
2307 const uint8x16_t mask_result =
2308 vshlq_u8(vandq_u8(input, mask_and), mask_shift);
2309 uint8x8_t lo = vget_low_u8(mask_result);
2310 uint8x8_t hi = vget_high_u8(mask_result);
2311
2312 return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
2313 #else
2314 // Use increasingly wide shifts+adds to collect the sign bits
2315 // together.
2316 // Since the widening shifts would be rather confusing to follow in little
2317 // endian, everything will be illustrated in big endian order instead. This
2318 // has a different result - the bits would actually be reversed on a big
2319 // endian machine.
2320
2321 // Starting input (only half the elements are shown):
2322 // 89 ff 1d c0 00 10 99 33
2323 uint8x16_t input = vreinterpretq_u8_m128i(a);
2324
2325 // Shift out everything but the sign bits with an unsigned shift right.
2326 //
2327 // Bytes of the vector::
2328 // 89 ff 1d c0 00 10 99 33
2329 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
2330 // | | | | | | | |
2331 // 01 01 00 01 00 00 01 00
2332 //
2333 // Bits of first important lane(s):
2334 // 10001001 (89)
2335 // \______
2336 // |
2337 // 00000001 (01)
2338 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
2339
2340 // Merge the even lanes together with a 16-bit unsigned shift right + add.
2341 // 'xx' represents garbage data which will be ignored in the final result.
2342 // In the important bytes, the add functions like a binary OR.
2343 //
2344 // 01 01 00 01 00 00 01 00
2345 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
2346 // \| \| \| \|
2347 // xx 03 xx 01 xx 00 xx 02
2348 //
2349 // 00000001 00000001 (01 01)
2350 // \_______ |
2351 // \|
2352 // xxxxxxxx xxxxxx11 (xx 03)
2353 uint32x4_t paired16 =
2354 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
2355
2356 // Repeat with a wider 32-bit shift + add.
2357 // xx 03 xx 01 xx 00 xx 02
2358 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
2359 // 14))
2360 // \| \|
2361 // xx xx xx 0d xx xx xx 02
2362 //
2363 // 00000011 00000001 (03 01)
2364 // \\_____ ||
2365 // '----.\||
2366 // xxxxxxxx xxxx1101 (xx 0d)
2367 uint64x2_t paired32 =
2368 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
2369
2370 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
2371 // lanes. xx xx xx 0d xx xx xx 02
2372 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
2373 // 28))
2374 // \|
2375 // xx xx xx xx xx xx xx d2
2376 //
2377 // 00001101 00000010 (0d 02)
2378 // \ \___ | |
2379 // '---. \| |
2380 // xxxxxxxx 11010010 (xx d2)
2381 uint8x16_t paired64 =
2382 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
2383
2384 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
2385 // xx xx xx xx xx xx xx d2
2386 // || return paired64[0]
2387 // d2
2388 // Note: Little endian would return the correct value 4b (01001011) instead.
2389 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
2390 #endif
2391 }
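
// Illustrative usage sketch (not part of the translation layer; _mm_cmpeq_epi8 and
// _mm_set1_epi8 are assumed to be provided elsewhere in this header). Bit i of the
// result is the sign bit of byte i, e.g. locating a byte value in a 16-byte block:
//   __m128i eq   = _mm_cmpeq_epi8(block, _mm_set1_epi8('\n'));
//   int     mask = _mm_movemask_epi8(eq); // bit i set <=> byte i of block == '\n'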
2392
2393 // Copy the lower 64-bit integer in a to dst.
2394 //
2395 // dst[63:0] := a[63:0]
2396 //
2397 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
2398 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
2399 {
2400 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
2401 }
2402
2403 // Copy the 64-bit integer a to the lower element of dst, and zero the upper
2404 // element.
2405 //
2406 // dst[63:0] := a[63:0]
2407 // dst[127:64] := 0
2408 //
2409 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
2410 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
2411 {
2412 return vreinterpretq_m128i_s64(
2413 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
2414 }
2415
2416 // NEON does not provide this method
2417 // Creates a 4-bit mask from the most significant bits of the four
2418 // single-precision, floating-point values.
2419 // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
2420 FORCE_INLINE int _mm_movemask_ps(__m128 a)
2421 {
2422 uint32x4_t input = vreinterpretq_u32_m128(a);
2423 #if defined(__aarch64__)
2424 static const int32x4_t shift = {0, 1, 2, 3};
2425 uint32x4_t tmp = vshrq_n_u32(input, 31);
2426 return vaddvq_u32(vshlq_u32(tmp, shift));
2427 #else
2428 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2429 // Shift out everything but the sign bits with a 32-bit unsigned shift
2430 // right.
2431 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2432 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2433 uint8x16_t paired =
2434 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2435 // Extract the result.
2436 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2437 #endif
2438 }
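
// Illustrative usage sketch (not part of the translation layer; _mm_cmplt_ps is
// assumed to be provided elsewhere in this header). Bit i of the result is the
// sign bit of float lane i:
//   int any_lt = _mm_movemask_ps(_mm_cmplt_ps(a, b)) != 0; // any lane with a < b?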
2439
2440 // Compute the bitwise NOT of a and then AND with a 128-bit vector containing
2441 // all 1's, and return 1 if the result is zero, otherwise return 0.
2442 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
2443 FORCE_INLINE int _mm_test_all_ones(__m128i a)
2444 {
2445 return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
2446 ~(uint64_t) 0;
2447 }
2448
2449 // Compute the bitwise AND of 128 bits (representing integer data) in a and
2450 // mask, and return 1 if the result is zero, otherwise return 0.
2451 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
2452 FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
2453 {
2454 int64x2_t a_and_mask =
2455 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
2456 return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
2457 : 1;
2458 }
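
// Illustrative usage sketch (not part of the translation layer):
// _mm_test_all_zeros(a, mask) returns 1 iff (a & mask) == 0, so a whole vector
// can be tested for zero by passing it as both arguments:
//   int is_zero = _mm_test_all_zeros(v, v);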
2459
2460 /* Math operations */
2461
2462 // Subtracts the four single-precision, floating-point values of a and b.
2463 //
2464 // r0 := a0 - b0
2465 // r1 := a1 - b1
2466 // r2 := a2 - b2
2467 // r3 := a3 - b3
2468 //
2469 // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
2470 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2471 {
2472 return vreinterpretq_m128_f32(
2473 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2474 }
2475
2476 // Subtract the lower single-precision (32-bit) floating-point element in b from
2477 // the lower single-precision (32-bit) floating-point element in a, store the
2478 // result in the lower element of dst, and copy the upper 3 packed elements from
2479 // a to the upper elements of dst.
2480 //
2481 // dst[31:0] := a[31:0] - b[31:0]
2482 // dst[127:32] := a[127:32]
2483 //
2484 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
2485 FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2486 {
2487 return _mm_move_ss(a, _mm_sub_ps(a, b));
2488 }
2489
2490 // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
2491 // and store the results in dst.
2492 // r0 := a0 - b0
2493 // r1 := a1 - b1
2494 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
2495 {
2496 return vreinterpretq_m128i_s64(
2497 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2498 }
2499
2500 // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
2501 // unsigned 32-bit integers of a.
2502 //
2503 // r0 := a0 - b0
2504 // r1 := a1 - b1
2505 // r2 := a2 - b2
2506 // r3 := a3 - b3
2507 //
2508 // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
2509 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
2510 {
2511 return vreinterpretq_m128i_s32(
2512 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2513 }
2514
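// Subtracts the 8 signed or unsigned 16-bit integers of b from the 8 signed or
// unsigned 16-bit integers of a.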
2515 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
2516 {
2517 return vreinterpretq_m128i_s16(
2518 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2519 }
2520
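// Subtracts the 16 signed or unsigned 8-bit integers of b from the 16 signed or
// unsigned 8-bit integers of a.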
2521 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
2522 {
2523 return vreinterpretq_m128i_s8(
2524 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2525 }
2526
2527 // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
2528 //
2529 // dst[63:0] := a[63:0] - b[63:0]
2530 //
2531 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
2532 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
2533 {
2534 return vreinterpret_m64_s64(
2535 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2536 }
2537
2538 // Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
2539 // integers of a and saturates.
2540 // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
2541 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
2542 {
2543 return vreinterpretq_m128i_u16(
2544 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2545 }
2546
2547 // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
2548 // integers of a and saturates.
2549 //
2550 // r0 := UnsignedSaturate(a0 - b0)
2551 // r1 := UnsignedSaturate(a1 - b1)
2552 // ...
2553 // r15 := UnsignedSaturate(a15 - b15)
2554 //
2555 // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
2556 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
2557 {
2558 return vreinterpretq_m128i_u8(
2559 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2560 }
2561
2562 // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
2563 // of a and saturates.
2564 //
2565 // r0 := SignedSaturate(a0 - b0)
2566 // r1 := SignedSaturate(a1 - b1)
2567 // ...
2568 // r15 := SignedSaturate(a15 - b15)
2569 //
2570 // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
2571 FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
2572 {
2573 return vreinterpretq_m128i_s8(
2574 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2575 }
2576
2577 // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
2578 // of a and saturates.
2579 //
2580 // r0 := SignedSaturate(a0 - b0)
2581 // r1 := SignedSaturate(a1 - b1)
2582 // ...
2583 // r7 := SignedSaturate(a7 - b7)
2584 //
2585 // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
2586 FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2587 {
2588 return vreinterpretq_m128i_s16(
2589 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2590 }
2591
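// Adds the 8 unsigned 16-bit integers in a to the 8 unsigned 16-bit integers in
// b and saturates.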
2592 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2593 {
2594 return vreinterpretq_m128i_u16(
2595 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2596 }
2597
2598 // Negate packed 8-bit integers in a when the corresponding signed
2599 // 8-bit integer in b is negative, and store the results in dst.
2600 // Elements in dst are zeroed out when the corresponding element
2601 // in b is zero.
2602 //
2603 // for i in 0..15
2604 // if b[i] < 0
2605 // r[i] := -a[i]
2606 // else if b[i] == 0
2607 // r[i] := 0
2608 // else
2609 // r[i] := a[i]
2610 // fi
2611 // done
2612 FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
2613 {
2614 int8x16_t a = vreinterpretq_s8_m128i(_a);
2615 int8x16_t b = vreinterpretq_s8_m128i(_b);
2616
2617 // signed shift right: faster than vclt
2618 // (b < 0) ? 0xFF : 0
2619 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
2620
2621 // (b == 0) ? 0xFF : 0
2622 #if defined(__aarch64__)
2623 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
2624 #else
2625 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
2626 #endif
2627
2628 // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative 'a')
2629 // based on ltMask
2630 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
2631 // res = masked & (~zeroMask)
2632 int8x16_t res = vbicq_s8(masked, zeroMask);
2633
2634 return vreinterpretq_m128i_s8(res);
2635 }
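
// Illustrative usage sketch (not part of the translation layer): passing the same
// vector as both arguments yields a per-byte absolute value, except that -128
// stays -128 because vnegq_s8 wraps:
//   __m128i abs_like = _mm_sign_epi8(v, v);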
2636
2637 // Negate packed 16-bit integers in a when the corresponding signed
2638 // 16-bit integer in b is negative, and store the results in dst.
2639 // Elements in dst are zeroed out when the corresponding element
2640 // in b is zero.
2641 //
2642 // for i in 0..7
2643 // if b[i] < 0
2644 // r[i] := -a[i]
2645 // else if b[i] == 0
2646 // r[i] := 0
2647 // else
2648 // r[i] := a[i]
2649 // fi
2650 // done
2651 FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
2652 {
2653 int16x8_t a = vreinterpretq_s16_m128i(_a);
2654 int16x8_t b = vreinterpretq_s16_m128i(_b);
2655
2656 // signed shift right: faster than vclt
2657 // (b < 0) ? 0xFFFF : 0
2658 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
2659 // (b == 0) ? 0xFFFF : 0
2660 #if defined(__aarch64__)
2661 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
2662 #else
2663 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
2664 #endif
2665
2666 // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
2667 // 'a') based on ltMask
2668 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
2669 // res = masked & (~zeroMask)
2670 int16x8_t res = vbicq_s16(masked, zeroMask);
2671 return vreinterpretq_m128i_s16(res);
2672 }
2673
2674 // Negate packed 32-bit integers in a when the corresponding signed
2675 // 32-bit integer in b is negative, and store the results in dst.
2676 // Elements in dst are zeroed out when the corresponding element
2677 // in b is zero.
2678 //
2679 // for i in 0..3
2680 // if b[i] < 0
2681 // r[i] := -a[i]
2682 // else if b[i] == 0
2683 // r[i] := 0
2684 // else
2685 // r[i] := a[i]
2686 // fi
2687 // done
2688 FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
2689 {
2690 int32x4_t a = vreinterpretq_s32_m128i(_a);
2691 int32x4_t b = vreinterpretq_s32_m128i(_b);
2692
2693 // signed shift right: faster than vclt
2694 // (b < 0) ? 0xFFFFFFFF : 0
2695 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
2696
2697 // (b == 0) ? 0xFFFFFFFF : 0
2698 #if defined(__aarch64__)
2699 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
2700 #else
2701 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
2702 #endif
2703
2704 // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
2705 // 'a') based on ltMask
2706 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
2707 // res = masked & (~zeroMask)
2708 int32x4_t res = vbicq_s32(masked, zeroMask);
2709 return vreinterpretq_m128i_s32(res);
2710 }
2711
2712 // Negate packed 16-bit integers in a when the corresponding signed 16-bit
2713 // integer in b is negative, and store the results in dst. Elements in dst are
2714 // zeroed out when the corresponding element in b is zero.
2715 //
2716 // FOR j := 0 to 3
2717 // i := j*16
2718 // IF b[i+15:i] < 0
2719 // dst[i+15:i] := -(a[i+15:i])
2720 // ELSE IF b[i+15:i] == 0
2721 // dst[i+15:i] := 0
2722 // ELSE
2723 // dst[i+15:i] := a[i+15:i]
2724 // FI
2725 // ENDFOR
2726 //
2727 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
2728 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
2729 {
2730 int16x4_t a = vreinterpret_s16_m64(_a);
2731 int16x4_t b = vreinterpret_s16_m64(_b);
2732
2733 // signed shift right: faster than vclt
2734 // (b < 0) ? 0xFFFF : 0
2735 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
2736
2737 // (b == 0) ? 0xFFFF : 0
2738 #if defined(__aarch64__)
2739 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
2740 #else
2741 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
2742 #endif
2743
2744 // bitwise select either a or negative 'a' (vneg_s16(a) returns negative 'a')
2745 // based on ltMask
2746 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
2747 // res = masked & (~zeroMask)
2748 int16x4_t res = vbic_s16(masked, zeroMask);
2749
2750 return vreinterpret_m64_s16(res);
2751 }
2752
2753 // Negate packed 32-bit integers in a when the corresponding signed 32-bit
2754 // integer in b is negative, and store the results in dst. Element in dst are
2755 // zeroed out when the corresponding element in b is zero.
2756 //
2757 // FOR j := 0 to 1
2758 // i := j*32
2759 // IF b[i+31:i] < 0
2760 // dst[i+31:i] := -(a[i+31:i])
2761 // ELSE IF b[i+31:i] == 0
2762 // dst[i+31:i] := 0
2763 // ELSE
2764 // dst[i+31:i] := a[i+31:i]
2765 // FI
2766 // ENDFOR
2767 //
2768 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
2769 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
2770 {
2771 int32x2_t a = vreinterpret_s32_m64(_a);
2772 int32x2_t b = vreinterpret_s32_m64(_b);
2773
2774 // signed shift right: faster than vclt
2775 // (b < 0) ? 0xFFFFFFFF : 0
2776 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
2777
2778 // (b == 0) ? 0xFFFFFFFF : 0
2779 #if defined(__aarch64__)
2780 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
2781 #else
2782 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
2783 #endif
2784
2785 // bitwise select either a or negative 'a' (vneg_s32(a) returns negative 'a')
2786 // based on ltMask
2787 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
2788 // res = masked & (~zeroMask)
2789 int32x2_t res = vbic_s32(masked, zeroMask);
2790
2791 return vreinterpret_m64_s32(res);
2792 }
2793
2794 // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
2795 // in b is negative, and store the results in dst. Elements in dst are zeroed out
2796 // when the corresponding element in b is zero.
2797 //
2798 // FOR j := 0 to 7
2799 // i := j*8
2800 // IF b[i+7:i] < 0
2801 // dst[i+7:i] := -(a[i+7:i])
2802 // ELSE IF b[i+7:i] == 0
2803 // dst[i+7:i] := 0
2804 // ELSE
2805 // dst[i+7:i] := a[i+7:i]
2806 // FI
2807 // ENDFOR
2808 //
2809 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
2810 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
2811 {
2812 int8x8_t a = vreinterpret_s8_m64(_a);
2813 int8x8_t b = vreinterpret_s8_m64(_b);
2814
2815 // signed shift right: faster than vclt
2816 // (b < 0) ? 0xFF : 0
2817 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
2818
2819 // (b == 0) ? 0xFF : 0
2820 #if defined(__aarch64__)
2821 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
2822 #else
2823 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
2824 #endif
2825
2826 // bitwise select either a or negative 'a' (vneg_s8(a) returns negative 'a')
2827 // based on ltMask
2828 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
2829 // res = masked & (~zeroMask)
2830 int8x8_t res = vbic_s8(masked, zeroMask);
2831
2832 return vreinterpret_m64_s8(res);
2833 }
2834
2835 // Average packed unsigned 16-bit integers in a and b, and store the results in
2836 // dst.
2837 //
2838 // FOR j := 0 to 3
2839 // i := j*16
2840 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2841 // ENDFOR
2842 //
2843 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
2844 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
2845 {
2846 return vreinterpret_m64_u16(
2847 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
2848 }
2849
2850 // Average packed unsigned 8-bit integers in a and b, and store the results in
2851 // dst.
2852 //
2853 // FOR j := 0 to 7
2854 // i := j*8
2855 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2856 // ENDFOR
2857 //
2858 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
2859 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
2860 {
2861 return vreinterpret_m64_u8(
2862 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2863 }
2864
2865 // Average packed unsigned 8-bit integers in a and b, and store the results in
2866 // dst.
2867 //
2868 // FOR j := 0 to 7
2869 // i := j*8
2870 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
2871 // ENDFOR
2872 //
2873 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
2874 #define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2875
2876 // Average packed unsigned 16-bit integers in a and b, and store the results in
2877 // dst.
2878 //
2879 // FOR j := 0 to 3
2880 // i := j*16
2881 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
2882 // ENDFOR
2883 //
2884 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
2885 #define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2886
2887 // Computes the average of the 16 unsigned 8-bit integers in a and the 16
2888 // unsigned 8-bit integers in b and rounds.
2889 //
2890 // r0 := (a0 + b0) / 2
2891 // r1 := (a1 + b1) / 2
2892 // ...
2893 // r15 := (a15 + b15) / 2
2894 //
2895 // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
2896 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
2897 {
2898 return vreinterpretq_m128i_u8(
2899 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2900 }
2901
2902 // Computes the average of the 8 unsigned 16-bit integers in a and the 8
2903 // unsigned 16-bit integers in b and rounds.
2904 //
2905 // r0 := (a0 + b0) / 2
2906 // r1 := (a1 + b1) / 2
2907 // ...
2908 // r7 := (a7 + b7) / 2
2909 //
2910 // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
2911 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
2912 {
2913 return vreinterpretq_m128i_u16(
2914 vrhaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
2915 }
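
// Illustrative example (not part of the translation layer; _mm_set1_epi8 is
// assumed to be provided elsewhere in this header): the average rounds up on
// ties, i.e. it computes (a + b + 1) >> 1 per lane:
//   __m128i r = _mm_avg_epu8(_mm_set1_epi8(1), _mm_set1_epi8(2)); // 2 in every lane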
2916
2917 // Adds the four single-precision, floating-point values of a and b.
2918 //
2919 // r0 := a0 + b0
2920 // r1 := a1 + b1
2921 // r2 := a2 + b2
2922 // r3 := a3 + b3
2923 //
2924 // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
2925 FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
2926 {
2927 return vreinterpretq_m128_f32(
2928 vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2929 }
2930
2931 // Add packed double-precision (64-bit) floating-point elements in a and b, and
2932 // store the results in dst.
2933 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
2934 FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2935 {
2936 #if defined(__aarch64__)
2937 return vreinterpretq_m128d_f64(
2938 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2939 #else
2940 double *da = (double *) &a;
2941 double *db = (double *) &b;
2942 double c[2];
2943 c[0] = da[0] + db[0];
2944 c[1] = da[1] + db[1];
2945 return vld1q_f32((float32_t *) c);
2946 #endif
2947 }
2948
2949 // Add 64-bit integers a and b, and store the result in dst.
2950 //
2951 // dst[63:0] := a[63:0] + b[63:0]
2952 //
2953 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
2954 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2955 {
2956 return vreinterpret_m64_s64(
2957 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2958 }
2959
2960 // adds the scalar single-precision floating point values of a and b.
2961 // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
2962 FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
2963 {
2964 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
2965 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
2966 // the upper values in the result must be the remnants of <a>.
2967 return vreinterpretq_m128_f32(vaddq_f32(a, value));
2968 }
2969
2970 // Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2971 // unsigned 64-bit integers in b.
2972 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2973 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2974 {
2975 return vreinterpretq_m128i_s64(
2976 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2977 }
2978
2979 // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2980 // unsigned 32-bit integers in b.
2981 //
2982 // r0 := a0 + b0
2983 // r1 := a1 + b1
2984 // r2 := a2 + b2
2985 // r3 := a3 + b3
2986 //
2987 // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2988 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2989 {
2990 return vreinterpretq_m128i_s32(
2991 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2992 }
2993
2994 // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2995 // unsigned 16-bit integers in b.
2996 // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2997 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2998 {
2999 return vreinterpretq_m128i_s16(
3000 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3001 }
3002
3003 // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
3004 // unsigned 8-bit integers in b.
3005 // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
3006 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
3007 {
3008 return vreinterpretq_m128i_s8(
3009 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3010 }
3011
3012 // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
3013 // and saturates.
3014 //
3015 // r0 := SignedSaturate(a0 + b0)
3016 // r1 := SignedSaturate(a1 + b1)
3017 // ...
3018 // r7 := SignedSaturate(a7 + b7)
3019 //
3020 // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
3021 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
3022 {
3023 return vreinterpretq_m128i_s16(
3024 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3025 }
3026
3027 // Add packed signed 8-bit integers in a and b using saturation, and store the
3028 // results in dst.
3029 //
3030 // FOR j := 0 to 15
3031 // i := j*8
3032 // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
3033 // ENDFOR
3034 //
3035 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
3036 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3037 {
3038 return vreinterpretq_m128i_s8(
3039 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3040 }
3041
3042 // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
3043 // b and saturates.
3044 // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
3045 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3046 {
3047 return vreinterpretq_m128i_u8(
3048 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3049 }
3050
3051 // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
3052 // unsigned 16-bit integers from b.
3053 //
3054 // r0 := (a0 * b0)[15:0]
3055 // r1 := (a1 * b1)[15:0]
3056 // ...
3057 // r7 := (a7 * b7)[15:0]
3058 //
3059 // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
3060 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
3061 {
3062 return vreinterpretq_m128i_s16(
3063 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3064 }
3065
3066 // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
3067 // unsigned 32-bit integers from b.
3068 // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
3069 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
3070 {
3071 return vreinterpretq_m128i_s32(
3072 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3073 }
3074
3075 // Multiply the packed unsigned 16-bit integers in a and b, producing
3076 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3077 // integers in dst.
3078 //
3079 // FOR j := 0 to 3
3080 // i := j*16
3081 // tmp[31:0] := a[i+15:i] * b[i+15:i]
3082 // dst[i+15:i] := tmp[31:16]
3083 // ENDFOR
3084 //
3085 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
3086 #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
3087
3088 // Multiplies the four single-precision, floating-point values of a and b.
3089 //
3090 // r0 := a0 * b0
3091 // r1 := a1 * b1
3092 // r2 := a2 * b2
3093 // r3 := a3 * b3
3094 //
3095 // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
3096 FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
3097 {
3098 return vreinterpretq_m128_f32(
3099 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3100 }
3101
3102 // Multiply packed double-precision (64-bit) floating-point elements in a and b,
3103 // and store the results in dst.
3104 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
3105 FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
3106 {
3107 #if defined(__aarch64__)
3108 return vreinterpretq_m128d_f64(
3109 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3110 #else
3111 double *da = (double *) &a;
3112 double *db = (double *) &b;
3113 double c[2];
3114 c[0] = da[0] * db[0];
3115 c[1] = da[1] * db[1];
3116 return vld1q_f32((float32_t *) c);
3117 #endif
3118 }
3119
3120 // Multiply the lower single-precision (32-bit) floating-point element in a and
3121 // b, store the result in the lower element of dst, and copy the upper 3 packed
3122 // elements from a to the upper elements of dst.
3123 //
3124 // dst[31:0] := a[31:0] * b[31:0]
3125 // dst[127:32] := a[127:32]
3126 //
3127 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
3128 FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
3129 {
3130 return _mm_move_ss(a, _mm_mul_ps(a, b));
3131 }
3132
3133 // Multiply the low unsigned 32-bit integers from each packed 64-bit element in
3134 // a and b, and store the unsigned 64-bit results in dst.
3135 //
3136 // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
3137 // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
3138 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
3139 {
3140 // vmull_u32 upcasts instead of masking, so we downcast.
3141 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
3142 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
3143 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
3144 }
3145
3146 // Multiply the low unsigned 32-bit integers from a and b, and store the
3147 // unsigned 64-bit result in dst.
3148 //
3149 // dst[63:0] := a[31:0] * b[31:0]
3150 //
3151 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
3152 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
3153 {
3154 return vreinterpret_m64_u64(vget_low_u64(
3155 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
3156 }
3157
3158 // Multiply the low signed 32-bit integers from each packed 64-bit element in
3159 // a and b, and store the signed 64-bit results in dst.
3160 //
3161 // r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
3162 // r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
3163 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
3164 {
3165 // vmull_s32 upcasts instead of masking, so we downcast.
3166 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
3167 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
3168 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
3169 }
3170
3171 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3172 // integers from b.
3173 //
3174 // r0 := (a0 * b0) + (a1 * b1)
3175 // r1 := (a2 * b2) + (a3 * b3)
3176 // r2 := (a4 * b4) + (a5 * b5)
3177 // r3 := (a6 * b6) + (a7 * b7)
3178 // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
3179 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
3180 {
3181 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3182 vget_low_s16(vreinterpretq_s16_m128i(b)));
3183 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3184 vget_high_s16(vreinterpretq_s16_m128i(b)));
3185
3186 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
3187 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
3188
3189 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
3190 }
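// For example (16-bit lanes listed low to high): with
// a = { 1, 2, 3, 4, 5, 6, 7, 8 } and b = { 1, 1, 1, 1, 2, 2, 2, 2 },
// _mm_madd_epi16(a, b) produces the 32-bit lanes { 3, 7, 22, 30 }. Each
// 16x16 product is widened to 32 bits by vmull_s16 before the pairwise add,
// so the intermediate sums cannot overflow.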
3191
3192 // Multiply packed signed 16-bit integers in a and b, producing intermediate
3193 // signed 32-bit integers. Shift right by 15 bits while rounding up, and store
3194 // the packed 16-bit integers in dst.
3195 //
3196 // r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
3197 // r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
3198 // r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
3199 // ...
3200 // r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
3201 FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
3202 {
3203 // Has issues due to saturation
3204 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
3205
3206 // Multiply
3207 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
3208 vget_low_s16(vreinterpretq_s16_m128i(b)));
3209 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
3210 vget_high_s16(vreinterpretq_s16_m128i(b)));
3211
3212 // Rounding narrowing shift right
3213 // narrow = (int16_t)((mul + 16384) >> 15);
3214 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
3215 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
3216
3217 // Join together
3218 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
3219 }
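// The widen/round/narrow sequence matches x86 even in the one case where the
// saturating shortcut diverges: for a = b = INT16_MIN (0x8000), pmulhrsw
// returns 0x8000 (-32768), which vrshrn_n_s32 reproduces because the
// narrowing is not saturating, whereas vqrdmulhq_s16 would saturate the
// doubled product and return 0x7FFF instead.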
3220
3221 // Vertically multiply each unsigned 8-bit integer from a with the corresponding
3222 // signed 8-bit integer from b, producing intermediate signed 16-bit integers.
3223 // Horizontally add adjacent pairs of intermediate signed 16-bit integers,
3224 // and pack the saturated results in dst.
3225 //
3226 // FOR j := 0 to 7
3227 // i := j*16
3228 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
3229 // a[i+7:i]*b[i+7:i] )
3230 // ENDFOR
3231 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
3232 {
3233 #if defined(__aarch64__)
3234 uint8x16_t a = vreinterpretq_u8_m128i(_a);
3235 int8x16_t b = vreinterpretq_s8_m128i(_b);
3236 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
3237 vmovl_s8(vget_low_s8(b)));
3238 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
3239 vmovl_s8(vget_high_s8(b)));
3240 return vreinterpretq_m128i_s16(
3241 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
3242 #else
3243 // This would be much simpler if x86 would choose to zero extend OR sign
3244 // extend, not both. This could probably be optimized better.
3245 uint16x8_t a = vreinterpretq_u16_m128i(_a);
3246 int16x8_t b = vreinterpretq_s16_m128i(_b);
3247
3248 // Zero extend a
3249 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
3250 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
3251
3252 // Sign extend by shifting left then shifting right.
3253 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
3254 int16x8_t b_odd = vshrq_n_s16(b, 8);
3255
3256 // multiply
3257 int16x8_t prod1 = vmulq_s16(a_even, b_even);
3258 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
3259
3260 // saturated add
3261 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
3262 #endif
3263 }
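// For example, with every unsigned byte of a equal to 255 and every signed
// byte of b equal to 127, each adjacent pair sums to 2 * 255 * 127 = 64770,
// which saturates to 32767; this is why the final pairwise add must be the
// saturating vqaddq_s16 rather than a plain addition.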
3264
3265 // Computes the fused multiply-add product of 32-bit floating point numbers.
3266 //
3267 // Return Value
3268 // Multiplies A and B, and adds C to the temporary result before returning it.
3269 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
3270 FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
3271 {
3272 #if defined(__aarch64__)
3273 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
3274 vreinterpretq_f32_m128(b),
3275 vreinterpretq_f32_m128(a)));
3276 #else
3277 return _mm_add_ps(_mm_mul_ps(a, b), c);
3278 #endif
3279 }
3280
3281 // Alternatively add and subtract packed single-precision (32-bit)
3282 // floating-point elements in a to/from packed elements in b, and store the
3283 // results in dst.
3284 //
3285 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
3286 FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
3287 {
3288 __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
3289 return _mm_fmadd_ps(b, mask, a);
3290 }
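// For example (lanes listed low to high): with a = { 1, 2, 3, 4 } and
// b = { 10, 20, 30, 40 }, _mm_addsub_ps(a, b) yields { -9, 22, -27, 44 }:
// even lanes compute a - b and odd lanes compute a + b, which is what the
// single fused multiply-add with the { -1, 1, -1, 1 } mask produces.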
3291
3292 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3293 // b, then horizontally sum each consecutive 8 differences to produce two
3294 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3295 // 16 bits of 64-bit elements in dst.
3296 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
3297 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
3298 {
3299 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
3300 uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3301 uint16_t r4 = t[4] + t[5] + t[6] + t[7];
3302 uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
3303 return (__m128i) vsetq_lane_u16(r4, r, 4);
3304 }
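// For example, with every byte of a equal to 10 and every byte of b equal to
// 3, every absolute difference is 7, so each 8-byte group sums to 56: the
// result holds 56 in bits [15:0] and bits [79:64] and zero elsewhere,
// matching the two-64-bit-element layout of PSADBW.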
3305
3306 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3307 // b, then horizontally sum each consecutive 8 differences to produce four
3308 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3309 // 16 bits of dst.
3310 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
3311 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
3312 {
3313 uint16x4_t t =
3314 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3315 uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3316 return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
3317 }
3318
3319 // Compute the absolute differences of packed unsigned 8-bit integers in a and
3320 // b, then horizontally sum each consecutive 8 differences to produce four
3321 // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
3322 // 16 bits of dst.
3323 //
3324 // FOR j := 0 to 7
3325 // i := j*8
3326 // tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
3327 // ENDFOR
3328 // dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
3329 //             tmp[47:40] + tmp[55:48] + tmp[63:56]
// dst[63:16] := 0
3330 //
3331 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
3332 #define _m_psadbw(a, b) _mm_sad_pu8(a, b)
3333
3334 // Divides the four single-precision, floating-point values of a and b.
3335 //
3336 // r0 := a0 / b0
3337 // r1 := a1 / b1
3338 // r2 := a2 / b2
3339 // r3 := a3 / b3
3340 //
3341 // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
3342 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
3343 {
3344 #if defined(__aarch64__)
3345 return vreinterpretq_m128_f32(
3346 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3347 #else
3348 float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
3349 float32x4_t recip1 =
3350 vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
3351 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
3352 #endif
3353 }
3354
3355 // Divides the scalar single-precision floating point value of a by b.
3356 // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
3357 FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
3358 {
3359 float32_t value =
3360 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
3361 return vreinterpretq_m128_f32(
3362 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3363 }
3364
3365 // Compute the approximate reciprocal of packed single-precision (32-bit)
3366 // floating-point elements in a, and store the results in dst. The maximum
3367 // relative error for this approximation is less than 1.5*2^-12.
3368 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
3369 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
3370 {
3371 #if defined(__aarch64__)
3372 return vreinterpretq_m128_f32(
3373 vdivq_f32(vdupq_n_f32(1.0f), vreinterpretq_f32_m128(in)));
3374 #else
3375 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
3376 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
3377 return vreinterpretq_m128_f32(recip);
3378 #endif
3379 }
3380
3381 // Compute the approximate reciprocal of the lower single-precision (32-bit)
3382 // floating-point element in a, store the result in the lower element of dst,
3383 // and copy the upper 3 packed elements from a to the upper elements of dst. The
3384 // maximum relative error for this approximation is less than 1.5*2^-12.
3385 //
3386 // dst[31:0] := (1.0 / a[31:0])
3387 // dst[127:32] := a[127:32]
3388 //
3389 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
3390 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
3391 {
3392 return _mm_move_ss(a, _mm_rcp_ps(a));
3393 }
3394
3395 // Computes the approximations of square roots of the four single-precision,
3396 // floating-point values of a. First computes reciprocal square roots and then
3397 // reciprocals of the four values.
3398 //
3399 // r0 := sqrt(a0)
3400 // r1 := sqrt(a1)
3401 // r2 := sqrt(a2)
3402 // r3 := sqrt(a3)
3403 //
3404 // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
3405 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
3406 {
3407 #if defined(__aarch64__)
3408 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
3409 #else
3410 float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
3411 float32x4_t sq = vrecpeq_f32(recipsq);
3412 // ??? use step versions of both sqrt and recip for better accuracy?
3413 return vreinterpretq_m128_f32(sq);
3414 #endif
3415 }
3416
3417 // Computes the approximation of the square root of the scalar single-precision
3418 // floating point value of in.
3419 // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
3420 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
3421 {
3422 float32_t value =
3423 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
3424 return vreinterpretq_m128_f32(
3425 vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
3426 }
3427
3428 // Computes the approximations of the reciprocal square roots of the four
3429 // single-precision floating point values of in.
3430 // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
3431 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
3432 {
3433 return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
3434 }
3435
3436 // Compute the approximate reciprocal square root of the lower single-precision
3437 // (32-bit) floating-point element in a, store the result in the lower element
3438 // of dst, and copy the upper 3 packed elements from a to the upper elements of
3439 // dst.
3440 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
3441 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
3442 {
3443 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
3444 }
3445
3446 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3447 // values in dst.
3448 //
3449 // FOR j := 0 to 3
3450 // i := j*16
3451 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3452 // ENDFOR
3453 //
3454 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3455 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
3456 {
3457 return vreinterpret_m64_s16(
3458 vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3459 }
3460
3461 // Compare packed signed 16-bit integers in a and b, and store packed maximum
3462 // values in dst.
3463 //
3464 // FOR j := 0 to 3
3465 // i := j*16
3466 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
3467 // ENDFOR
3468 //
3469 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
3470 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
3471
3472 // Computes the maximums of the four single-precision, floating-point values of
3473 // a and b.
3474 // https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
3475 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
3476 {
3477 #if SSE2NEON_PRECISE_MINMAX
3478 float32x4_t _a = vreinterpretq_f32_m128(a);
3479 float32x4_t _b = vreinterpretq_f32_m128(b);
3480 return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
3481 #else
3482 return vreinterpretq_m128_f32(
3483 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3484 #endif
3485 }
3486
3487 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3488 // values in dst.
3489 //
3490 // FOR j := 0 to 7
3491 // i := j*8
3492 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3493 // ENDFOR
3494 //
3495 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3496 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
3497 {
3498 return vreinterpret_m64_u8(
3499 vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3500 }
3501
3502 // Compare packed unsigned 8-bit integers in a and b, and store packed maximum
3503 // values in dst.
3504 //
3505 // FOR j := 0 to 7
3506 // i := j*8
3507 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
3508 // ENDFOR
3509 //
3510 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
3511 #define _m_pmaxub(a, b) _mm_max_pu8(a, b)
3512
3513 // Compare packed signed 16-bit integers in a and b, and store packed minimum
3514 // values in dst.
3515 //
3516 // FOR j := 0 to 3
3517 // i := j*16
3518 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3519 // ENDFOR
3520 //
3521 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3522 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
3523 {
3524 return vreinterpret_m64_s16(
3525 vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3526 }
3527
3528 // Compare packed signed 16-bit integers in a and b, and store packed minimum
3529 // values in dst.
3530 //
3531 // FOR j := 0 to 3
3532 // i := j*16
3533 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
3534 // ENDFOR
3535 //
3536 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
3537 #define _m_pminsw(a, b) _mm_min_pi16(a, b)
3538
3539 // Computes the minima of the four single-precision, floating-point values of a
3540 // and b.
3541 // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
3542 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
3543 {
3544 #if SSE2NEON_PRECISE_MINMAX
3545 float32x4_t _a = vreinterpretq_f32_m128(a);
3546 float32x4_t _b = vreinterpretq_f32_m128(b);
3547 return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
3548 #else
3549 return vreinterpretq_m128_f32(
3550 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3551 #endif
3552 }
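// The two paths differ only for special operands: for example, x86 MINPS with
// a0 = NaN and b0 = 1.0f returns 1.0f (the second operand is returned whenever
// the comparison is unordered), which the vcltq_f32/vbslq_f32 path reproduces,
// while vminq_f32 would return NaN. The same reasoning applies to _mm_max_ps
// above.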
3553
3554 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3555 // values in dst.
3556 //
3557 // FOR j := 0 to 7
3558 // i := j*8
3559 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3560 // ENDFOR
3561 //
3562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3563 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
3564 {
3565 return vreinterpret_m64_u8(
3566 vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
3567 }
3568
3569 // Compare packed unsigned 8-bit integers in a and b, and store packed minimum
3570 // values in dst.
3571 //
3572 // FOR j := 0 to 7
3573 // i := j*8
3574 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
3575 // ENDFOR
3576 //
3577 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
3578 #define _m_pminub(a, b) _mm_min_pu8(a, b)
3579
3580 // Computes the maximum of the two lower scalar single-precision floating point
3581 // values of a and b.
3582 // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
3583 FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
3584 {
3585 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
3586 return vreinterpretq_m128_f32(
3587 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3588 }
3589
3590 // Computes the minimum of the two lower scalar single-precision floating point
3591 // values of a and b.
3592 // https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
3593 FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
3594 {
3595 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
3596 return vreinterpretq_m128_f32(
3597 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
3598 }
3599
3600 // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
3601 // 16 unsigned 8-bit integers from b.
3602 // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
3603 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
3604 {
3605 return vreinterpretq_m128i_u8(
3606 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3607 }
3608
3609 // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
3610 // 16 unsigned 8-bit integers from b.
3611 // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
3612 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
3613 {
3614 return vreinterpretq_m128i_u8(
3615 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3616 }
3617
3618 // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
3619 // signed 16-bit integers from b.
3620 // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
3621 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
3622 {
3623 return vreinterpretq_m128i_s16(
3624 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3625 }
3626
3627 // Compare packed signed 8-bit integers in a and b, and store packed maximum
3628 // values in dst.
3629 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
3630 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
3631 {
3632 return vreinterpretq_m128i_s8(
3633 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3634 }
3635
3636 // Compare packed unsigned 16-bit integers in a and b, and store packed maximum
3637 // values in dst.
3638 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
3639 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
3640 {
3641 return vreinterpretq_m128i_u16(
3642 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3643 }
3644
3645 // Compare packed signed 8-bit integers in a and b, and store packed minimum
3646 // values in dst.
3647 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
3648 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
3649 {
3650 return vreinterpretq_m128i_s8(
3651 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3652 }
3653
3654 // Compare packed unsigned 16-bit integers in a and b, and store packed minimum
3655 // values in dst.
3656 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
3657 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
3658 {
3659 return vreinterpretq_m128i_u16(
3660 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3661 }
3662
3663 // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
3664 // signed 16-bit integers from b.
3665 // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
3666 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
3667 {
3668 return vreinterpretq_m128i_s16(
3669 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3670 }
3671
3672 // epi versions of min/max
3673 // Computes the pairwise maximums of the four signed 32-bit integer values of a
3674 // and b.
3675 //
3676 // A 128-bit parameter that can be defined with the following equations:
3677 // r0 := (a0 > b0) ? a0 : b0
3678 // r1 := (a1 > b1) ? a1 : b1
3679 // r2 := (a2 > b2) ? a2 : b2
3680 // r3 := (a3 > b3) ? a3 : b3
3681 //
3682 // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
3683 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
3684 {
3685 return vreinterpretq_m128i_s32(
3686 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3687 }
3688
3689 // Computes the pairwise minima of the four signed 32-bit integer values of a
3690 // and b.
3691 //
3692 // A 128-bit parameter that can be defined with the following equations:
3693 // r0 := (a0 < b0) ? a0 : b0
3694 // r1 := (a1 < b1) ? a1 : b1
3695 // r2 := (a2 < b2) ? a2 : b2
3696 // r3 := (a3 < b3) ? a3 : b3
3697 //
3698 // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
3699 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
3700 {
3701 return vreinterpretq_m128i_s32(
3702 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3703 }
3704
3705 // Compare packed unsigned 32-bit integers in a and b, and store packed maximum
3706 // values in dst.
3707 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
3708 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
3709 {
3710 return vreinterpretq_m128i_u32(
3711 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3712 }
3713
3714 // Compare packed unsigned 32-bit integers in a and b, and store packed minimum
3715 // values in dst.
3716 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
3717 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
3718 {
3719 return vreinterpretq_m128i_u32(
3720 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
3721 }
3722
3723 // Multiply the packed unsigned 16-bit integers in a and b, producing
3724 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3725 // integers in dst.
3726 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
3727 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
3728 {
3729 return vreinterpret_m64_u16(vshrn_n_u32(
3730 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
3731 }
3732
3733 // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
3734 // integers from b.
3735 //
3736 // r0 := (a0 * b0)[31:16]
3737 // r1 := (a1 * b1)[31:16]
3738 // ...
3739 // r7 := (a7 * b7)[31:16]
3740 //
3741 // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
3742 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
3743 {
3744 /* FIXME: issue with large values because of result saturation */
3745 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
3746 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
3747 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
3748 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
3749 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
3750 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3751 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
3752 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
3753 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3754 uint16x8x2_t r =
3755 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
3756 return vreinterpretq_m128i_u16(r.val[1]);
3757 }
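// For example, for lane values a = -2 and b = 3 the full product is -6
// (0xFFFFFFFA), so the stored high half is 0xFFFF (-1); for a = 16384 and
// b = 4 the product is 65536 (0x00010000), so the high half is 1. The
// vuzpq_u16 step simply gathers these high halves out of the widened products.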
3758
3759 // Multiply the packed unsigned 16-bit integers in a and b, producing
3760 // intermediate 32-bit integers, and store the high 16 bits of the intermediate
3761 // integers in dst.
3762 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
3763 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
3764 {
3765 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
3766 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
3767 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
3768 #if defined(__aarch64__)
3769 uint32x4_t ab7654 =
3770 vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
3771 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
3772 vreinterpretq_u16_u32(ab7654));
3773 return vreinterpretq_m128i_u16(r);
3774 #else
3775 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
3776 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
3777 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
3778 uint16x8x2_t r =
3779 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
3780 return vreinterpretq_m128i_u16(r.val[1]);
3781 #endif
3782 }
3783
3784 // Computes pairwise add of each argument as single-precision, floating-point
3785 // values a and b.
3786 // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
3787 FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
3788 {
3789 #if defined(__aarch64__)
3790 return vreinterpretq_m128_f32(
3791 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3792 #else
3793 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
3794 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
3795 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
3796 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
3797 return vreinterpretq_m128_f32(
3798 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
3799 #endif
3800 }
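// For example (lanes listed low to high): with a = { 1, 2, 3, 4 } and
// b = { 10, 20, 30, 40 }, _mm_hadd_ps(a, b) yields { 3, 7, 30, 70 }: the low
// two lanes are the adjacent-pair sums of a and the high two lanes are the
// adjacent-pair sums of b.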
3801
3802 // Computes pairwise add of each argument as 16-bit signed or unsigned integer
3803 // values a and b.
3804 FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
3805 {
3806 int16x8_t a = vreinterpretq_s16_m128i(_a);
3807 int16x8_t b = vreinterpretq_s16_m128i(_b);
3808 #if defined(__aarch64__)
3809 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
3810 #else
3811 return vreinterpretq_m128i_s16(
3812 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
3813 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
3814 #endif
3815 }
3816
3817 // Horizontally subtract adjacent pairs of single-precision (32-bit)
3818 // floating-point elements in a and b, and pack the results in dst.
3819 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
3820 FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
3821 {
3822 #if defined(__aarch64__)
3823 return vreinterpretq_m128_f32(vsubq_f32(
3824 vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
3825 vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
3826 #else
3827 float32x4x2_t c =
3828 vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
3829 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
3830 #endif
3831 }
3832
3833 // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
3834 // signed 16-bit results in dst.
3835 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
3836 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
3837 {
3838 return vreinterpret_m64_s16(
3839 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
3840 }
3841
3842 // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
3843 // signed 32-bit results in dst.
3844 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
3845 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
3846 {
3847 return vreinterpret_m64_s32(
3848 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
3849 }
3850
3851 // Computes pairwise difference of each argument as 16-bit signed or unsigned
3852 // integer values a and b.
3853 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
3854 {
3855 int32x4_t a = vreinterpretq_s32_m128i(_a);
3856 int32x4_t b = vreinterpretq_s32_m128i(_b);
3857 // Interleave using vshrn/vmovn
3858 // [a0|a2|a4|a6|b0|b2|b4|b6]
3859 // [a1|a3|a5|a7|b1|b3|b5|b7]
3860 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3861 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3862 // Subtract
3863 return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
3864 }
3865
3866 // Computes saturated pairwise add of each argument as 16-bit signed
3867 // integer values a and b.
3868 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
3869 {
3870 #if defined(__aarch64__)
3871 int16x8_t a = vreinterpretq_s16_m128i(_a);
3872 int16x8_t b = vreinterpretq_s16_m128i(_b);
3873 return vreinterpretq_s64_s16(
3874 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3875 #else
3876 int32x4_t a = vreinterpretq_s32_m128i(_a);
3877 int32x4_t b = vreinterpretq_s32_m128i(_b);
3878 // Interleave using vshrn/vmovn
3879 // [a0|a2|a4|a6|b0|b2|b4|b6]
3880 // [a1|a3|a5|a7|b1|b3|b5|b7]
3881 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3882 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3883 // Saturated add
3884 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
3885 #endif
3886 }
3887
3888 // Computes saturated pairwise difference of each argument as 16-bit signed
3889 // integer values a and b.
3890 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
3891 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
3892 {
3893 #if defined(__aarch64__)
3894 int16x8_t a = vreinterpretq_s16_m128i(_a);
3895 int16x8_t b = vreinterpretq_s16_m128i(_b);
3896 return vreinterpretq_s64_s16(
3897 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
3898 #else
3899 int32x4_t a = vreinterpretq_s32_m128i(_a);
3900 int32x4_t b = vreinterpretq_s32_m128i(_b);
3901 // Interleave using vshrn/vmovn
3902 // [a0|a2|a4|a6|b0|b2|b4|b6]
3903 // [a1|a3|a5|a7|b1|b3|b5|b7]
3904 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
3905 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
3906 // Saturated subtract
3907 return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
3908 #endif
3909 }
3910
3911 // Computes pairwise add of each argument as 32-bit signed or unsigned integer
3912 // values a and b.
3913 FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
3914 {
3915 int32x4_t a = vreinterpretq_s32_m128i(_a);
3916 int32x4_t b = vreinterpretq_s32_m128i(_b);
3917 return vreinterpretq_m128i_s32(
3918 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
3919 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
3920 }
3921
3922 // Computes pairwise difference of each argument as 32-bit signed or unsigned
3923 // integer values a and b.
3924 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
3925 {
3926 int64x2_t a = vreinterpretq_s64_m128i(_a);
3927 int64x2_t b = vreinterpretq_s64_m128i(_b);
3928 // Interleave using vshrn/vmovn
3929 // [a0|a2|b0|b2]
3930 // [a1|a3|b1|b3]
3931 int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
3932 int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
3933 // Subtract
3934 return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
3935 }
3936
3937 // Kahan summation for accurate summation of floating-point numbers.
3938 // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
3939 FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
3940 {
3941 y -= *c;
3942 float t = *sum + y;
3943 *c = (t - *sum) - y;
3944 *sum = t;
3945 }
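// A minimal usage sketch (illustrative only; n and data stand for a caller's
// array length and buffer): carrying a running compensation term keeps the
// rounding error of the final sum bounded independently of the number of
// addends.
//
//   float sum = 0.0f, comp = 0.0f;
//   for (size_t i = 0; i < n; i++)
//       sse2neon_kadd_f32(&sum, &comp, data[i]);
//
// _mm_dp_ps below applies the same helper to each selected product.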
3946
3947 // Conditionally multiply the packed single-precision (32-bit) floating-point
3948 // elements in a and b using the high 4 bits in imm8, sum the four products,
3949 // and conditionally store the sum in dst using the low 4 bits of imm.
3950 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
3951 FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
3952 {
3953 #if defined(__aarch64__)
3954 /* shortcuts */
3955 if (imm == 0xFF) {
3956 return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
3957 }
3958 if (imm == 0x7F) {
3959 float32x4_t m = _mm_mul_ps(a, b);
3960 m[3] = 0;
3961 return _mm_set1_ps(vaddvq_f32(m));
3962 }
3963 #endif
3964
3965 float s = 0, c = 0;
3966 float32x4_t f32a = vreinterpretq_f32_m128(a);
3967 float32x4_t f32b = vreinterpretq_f32_m128(b);
3968
3969 /* To improve the accuracy of floating-point summation, Kahan algorithm
3970 * is used for each operation.
3971 */
3972 if (imm & (1 << 4))
3973 sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
3974 if (imm & (1 << 5))
3975 sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
3976 if (imm & (1 << 6))
3977 sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
3978 if (imm & (1 << 7))
3979 sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
3980 s += c;
3981
3982 float32x4_t res = {
3983 (imm & 0x1) ? s : 0,
3984 (imm & 0x2) ? s : 0,
3985 (imm & 0x4) ? s : 0,
3986 (imm & 0x8) ? s : 0,
3987 };
3988 return vreinterpretq_m128_f32(res);
3989 }
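// For example (lanes listed low to high): with a = { 1, 2, 3, 4 },
// b = { 5, 6, 7, 8 } and imm = 0x71, the high nibble (0x7) selects the
// products of lanes 0..2 (5 + 12 + 21 = 38) and the low nibble (0x1) stores
// the sum in lane 0 only, giving { 38, 0, 0, 0 }.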
3990
3991 /* Compare operations */
3992
3993 // Compares for less than
3994 // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
3995 FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
3996 {
3997 return vreinterpretq_m128_u32(
3998 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
3999 }
4000
4001 // Compares for less than
4002 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
4003 FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
4004 {
4005 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
4006 }
4007
4008 // Compares for greater than.
4009 //
4010 // r0 := (a0 > b0) ? 0xffffffff : 0x0
4011 // r1 := (a1 > b1) ? 0xffffffff : 0x0
4012 // r2 := (a2 > b2) ? 0xffffffff : 0x0
4013 // r3 := (a3 > b3) ? 0xffffffff : 0x0
4014 //
4015 // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
4016 FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
4017 {
4018 return vreinterpretq_m128_u32(
4019 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4020 }
4021
4022 // Compares for greater than.
4023 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
4024 FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
4025 {
4026 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
4027 }
4028
4029 // Compares for greater than or equal.
4030 // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
4031 FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
4032 {
4033 return vreinterpretq_m128_u32(
4034 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4035 }
4036
4037 // Compares for greater than or equal.
4038 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
4039 FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
4040 {
4041 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
4042 }
4043
4044 // Compares for less than or equal.
4045 //
4046 // r0 := (a0 <= b0) ? 0xffffffff : 0x0
4047 // r1 := (a1 <= b1) ? 0xffffffff : 0x0
4048 // r2 := (a2 <= b2) ? 0xffffffff : 0x0
4049 // r3 := (a3 <= b3) ? 0xffffffff : 0x0
4050 //
4051 // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
4052 FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
4053 {
4054 return vreinterpretq_m128_u32(
4055 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4056 }
4057
4058 // Compares for less than or equal.
4059 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
4060 FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
4061 {
4062 return _mm_move_ss(a, _mm_cmple_ps(a, b));
4063 }
4064
4065 // Compares for equality.
4066 // https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
4067 FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
4068 {
4069 return vreinterpretq_m128_u32(
4070 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4071 }
4072
4073 // Compares for equality.
4074 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
4075 FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
4076 {
4077 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
4078 }
4079
4080 // Compares for inequality.
4081 // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
4082 FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
4083 {
4084 return vreinterpretq_m128_u32(vmvnq_u32(
4085 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
4086 }
4087
4088 // Compares for inequality.
4089 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
4090 FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
4091 {
4092 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
4093 }
4094
4095 // Compares for not greater than or equal.
4096 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
4097 FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
4098 {
4099 return _mm_cmplt_ps(a, b);
4100 }
4101
4102 // Compares for not greater than or equal.
4103 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
4104 FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
4105 {
4106 return _mm_cmplt_ss(a, b);
4107 }
4108
4109 // Compares for not greater than.
4110 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
4111 FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
4112 {
4113 return _mm_cmple_ps(a, b);
4114 }
4115
4116 // Compares for not greater than.
4117 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
4118 FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
4119 {
4120 return _mm_cmple_ss(a, b);
4121 }
4122
4123 // Compares for not less than or equal.
4124 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
4125 FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
4126 {
4127 return _mm_cmpgt_ps(a, b);
4128 }
4129
4130 // Compares for not less than or equal.
4131 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
4132 FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
4133 {
4134 return _mm_cmpgt_ss(a, b);
4135 }
4136
4137 // Compares for not less than.
4138 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
4139 FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
4140 {
4141 return _mm_cmpge_ps(a, b);
4142 }
4143
4144 // Compares for not less than.
4145 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
4146 FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
4147 {
4148 return _mm_cmpge_ss(a, b);
4149 }
4150
4151 // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
4152 // unsigned 8-bit integers in b for equality.
4153 // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
4154 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
4155 {
4156 return vreinterpretq_m128i_u8(
4157 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4158 }
4159
4160 // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
4161 // unsigned 16-bit integers in b for equality.
4162 // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
4163 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
4164 {
4165 return vreinterpretq_m128i_u16(
4166 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4167 }
4168
4169 // Compare packed 32-bit integers in a and b for equality, and store the results
4170 // in dst
4171 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
4172 {
4173 return vreinterpretq_m128i_u32(
4174 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4175 }
4176
4177 // Compare packed 64-bit integers in a and b for equality, and store the results
4178 // in dst
4179 FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
4180 {
4181 #if defined(__aarch64__)
4182 return vreinterpretq_m128i_u64(
4183 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
4184 #else
4185 // ARMv7 lacks vceqq_u64
4186 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
4187 uint32x4_t cmp =
4188 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
4189 uint32x4_t swapped = vrev64q_u32(cmp);
4190 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
4191 #endif
4192 }
4193
4194 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4195 // in b for less than.
4196 // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
4197 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
4198 {
4199 return vreinterpretq_m128i_u8(
4200 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4201 }
4202
4203 // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
4204 // in b for greater than.
4205 //
4206 // r0 := (a0 > b0) ? 0xff : 0x0
4207 // r1 := (a1 > b1) ? 0xff : 0x0
4208 // ...
4209 // r15 := (a15 > b15) ? 0xff : 0x0
4210 //
4211 // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
4212 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
4213 {
4214 return vreinterpretq_m128i_u8(
4215 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
4216 }
4217
4218 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4219 // in b for less than.
4220 //
4221 // r0 := (a0 < b0) ? 0xffff : 0x0
4222 // r1 := (a1 < b1) ? 0xffff : 0x0
4223 // ...
4224 // r7 := (a7 < b7) ? 0xffff : 0x0
4225 //
4226 // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
4227 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
4228 {
4229 return vreinterpretq_m128i_u16(
4230 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4231 }
4232
4233 // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
4234 // in b for greater than.
4235 //
4236 // r0 := (a0 > b0) ? 0xffff : 0x0
4237 // r1 := (a1 > b1) ? 0xffff : 0x0
4238 // ...
4239 // r7 := (a7 > b7) ? 0xffff : 0x0
4240 //
4241 // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
4242 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
4243 {
4244 return vreinterpretq_m128i_u16(
4245 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4246 }
4247
4248
4249 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4250 // in b for less than.
4251 // https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
4252 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
4253 {
4254 return vreinterpretq_m128i_u32(
4255 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4256 }
4257
4258 // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
4259 // in b for greater than.
4260 // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
4261 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
4262 {
4263 return vreinterpretq_m128i_u32(
4264 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4265 }
4266
4267 // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
4268 // in b for greater than.
4269 FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
4270 {
4271 #if defined(__aarch64__)
4272 return vreinterpretq_m128i_u64(
4273 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
4274 #else
4275 // ARMv7 lacks vcgtq_s64.
4276 // This is based off of Clang's SSE2 polyfill:
4277 // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
4278
4279 // Mask the sign bit out since we need a signed AND an unsigned comparison
4280 // and it is ugly to try and split them.
4281 int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
4282 int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
4283 int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
4284 // Check if a > b
4285 int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
4286 // Copy upper mask to lower mask
4287 // a_hi > b_hi
4288 int64x2_t gt_hi = vshrq_n_s64(greater, 63);
4289 // Copy lower mask to upper mask
4290 // a_lo > b_lo
4291 int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
4292 // Compare for equality
4293 int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
4294 // Copy upper mask to lower mask
4295 // a_hi == b_hi
4296 int64x2_t eq_hi = vshrq_n_s64(equal, 63);
4297 // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
4298 int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
4299 return vreinterpretq_m128i_s64(ret);
4300 #endif
4301 }
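// Worked example (an illustrative sketch, not part of the original header) of
// the ARMv7 decomposition above. Take one lane with a = 0x0000000200000001 and
// b = 0x0000000200000005, so a_hi = b_hi = 2, a_lo = 1, b_lo = 5:
//   a_hi > b_hi  -> false
//   a_hi == b_hi -> true
//   a_lo > b_lo  -> false
//   lane result  -> false || (false && true) = false -> 0x0000000000000000
// Flipping bit 31 of each low word beforehand lets the single signed
// vcgtq_s32 order the low halves as unsigned values while the high halves
// keep their signed ordering.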
4302
4303 // Compares the four 32-bit floats in a and b to check if any values are NaN.
4304 // Ordered compare between each value returns true for "orderable" and false for
4305 // "not orderable" (NaN).
4306 // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
4307 // also:
4308 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
4309 // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
4310 FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
4311 {
4312 // Note: NEON does not have ordered compare builtin
4313 // Need to compare a eq a and b eq b to check for NaN
4314 // Do AND of results to get final
4315 uint32x4_t ceqaa =
4316 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4317 uint32x4_t ceqbb =
4318 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4319 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
4320 }
4321
4322 // Compares for ordered.
4323 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
4324 FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
4325 {
4326 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
4327 }
4328
4329 // Compares for unordered.
4330 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
4331 FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
4332 {
4333 uint32x4_t f32a =
4334 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4335 uint32x4_t f32b =
4336 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4337 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
4338 }
4339
4340 // Compares for unordered.
4341 // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
4342 FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
4343 {
4344 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
4345 }
4346
4347 // Compares the lower single-precision floating point scalar values of a and b
4348 // using a less than operation.
4349 // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
4350 // Important note: the MSDN documentation is incorrect. If either of the values
4351 // is NaN, the docs say you will get a one, but in fact it will return a zero.
4352 FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
4353 {
4354 uint32x4_t a_not_nan =
4355 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4356 uint32x4_t b_not_nan =
4357 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4358 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4359 uint32x4_t a_lt_b =
4360 vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4361 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
4362 }
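// Illustrative usage (not part of the original header); assumes NAN from
// <math.h> and the _mm_set_ss defined elsewhere in this file:
//
//   int r = _mm_comilt_ss(_mm_set_ss(NAN), _mm_set_ss(1.0f));
//   // r == 0: the a_not_nan/b_not_nan masks zero out the comparison result
//   // whenever either operand is NaN, which is the behaviour noted above.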
4363
4364 // Compares the lower single-precision floating point scalar values of a and b
4365 // using a greater than operation. :
4366 // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
4367 FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
4368 {
4369 // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
4370 // vreinterpretq_f32_m128(b)), 0);
4371 uint32x4_t a_not_nan =
4372 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4373 uint32x4_t b_not_nan =
4374 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4375 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4376 uint32x4_t a_gt_b =
4377 vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4378 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
4379 }
4380
4381 // Compares the lower single-precision floating point scalar values of a and b
4382 // using a less than or equal operation. :
4383 // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
4384 FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
4385 {
4386 // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
4387 // vreinterpretq_f32_m128(b)), 0);
4388 uint32x4_t a_not_nan =
4389 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4390 uint32x4_t b_not_nan =
4391 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4392 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4393 uint32x4_t a_le_b =
4394 vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4395 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
4396 }
4397
4398 // Compares the lower single-precision floating point scalar values of a and b
4399 // using a greater than or equal operation. :
4400 // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
4401 FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
4402 {
4403 // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
4404 // vreinterpretq_f32_m128(b)), 0);
4405 uint32x4_t a_not_nan =
4406 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4407 uint32x4_t b_not_nan =
4408 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4409 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4410 uint32x4_t a_ge_b =
4411 vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4412 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
4413 }
4414
4415 // Compares the lower single-precision floating point scalar values of a and b
4416 // using an equality operation. :
4417 // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
4418 FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
4419 {
4420 // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4421 // vreinterpretq_f32_m128(b)), 0);
4422 uint32x4_t a_not_nan =
4423 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4424 uint32x4_t b_not_nan =
4425 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4426 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4427 uint32x4_t a_eq_b =
4428 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
4429 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
4430 }
4431
4432 // Compares the lower single-precision floating point scalar values of a and b
4433 // using an inequality operation. :
4434 // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
4435 FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
4436 {
4437 // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
4438 // vreinterpretq_f32_m128(b)), 0);
4439 uint32x4_t a_not_nan =
4440 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
4441 uint32x4_t b_not_nan =
4442 vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
4443 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4444 uint32x4_t a_neq_b = vmvnq_u32(
4445 vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
4446 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
4447 }
4448
4449 // according to the documentation, these intrinsics behave the same as the
4450 // non-'u' versions. We'll just alias them here.
4451 #define _mm_ucomilt_ss _mm_comilt_ss
4452 #define _mm_ucomile_ss _mm_comile_ss
4453 #define _mm_ucomigt_ss _mm_comigt_ss
4454 #define _mm_ucomige_ss _mm_comige_ss
4455 #define _mm_ucomieq_ss _mm_comieq_ss
4456 #define _mm_ucomineq_ss _mm_comineq_ss
4457
4458 /* Conversions */
4459
4460 // Convert packed signed 32-bit integers in b to packed single-precision
4461 // (32-bit) floating-point elements, store the results in the lower 2 elements
4462 // of dst, and copy the upper 2 packed elements from a to the upper elements of
4463 // dst.
4464 //
4465 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4466 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4467 // dst[95:64] := a[95:64]
4468 // dst[127:96] := a[127:96]
4469 //
4470 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
4471 FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
4472 {
4473 return vreinterpretq_m128_f32(
4474 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4475 vget_high_f32(vreinterpretq_f32_m128(a))));
4476 }
4477
4478 // Convert the signed 32-bit integer b to a single-precision (32-bit)
4479 // floating-point element, store the result in the lower element of dst, and
4480 // copy the upper 3 packed elements from a to the upper elements of dst.
4481 //
4482 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4483 // dst[127:32] := a[127:32]
4484 //
4485 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
4486 FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
4487 {
4488 return vreinterpretq_m128_f32(
4489 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
4490 }
4491
4492 // Convert the signed 32-bit integer b to a single-precision (32-bit)
4493 // floating-point element, store the result in the lower element of dst, and
4494 // copy the upper 3 packed elements from a to the upper elements of dst.
4495 //
4496 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4497 // dst[127:32] := a[127:32]
4498 //
4499 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
4500 #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
4501
4502 // Convert the signed 64-bit integer b to a single-precision (32-bit)
4503 // floating-point element, store the result in the lower element of dst, and
4504 // copy the upper 3 packed elements from a to the upper elements of dst.
4505 //
4506 // dst[31:0] := Convert_Int64_To_FP32(b[63:0])
4507 // dst[127:32] := a[127:32]
4508 //
4509 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
4510 FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
4511 {
4512 return vreinterpretq_m128_f32(
4513 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
4514 }
4515
4516 // Convert the lower single-precision (32-bit) floating-point element in a to a
4517 // 32-bit integer, and store the result in dst.
4518 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
4519 FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
4520 {
4521 #if defined(__aarch64__)
4522 return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
4523 #else
4524 float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
4525 float32_t diff = data - floor(data);
4526 if (diff > 0.5)
4527 return (int32_t) ceil(data);
4528 if (diff == 0.5) {
4529 int32_t f = (int32_t) floor(data);
4530 int32_t c = (int32_t) ceil(data);
4531 return c & 1 ? f : c;
4532 }
4533 return (int32_t) floor(data);
4534 #endif
4535 }
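// Worked example (an illustrative sketch, not part of the original header) of
// the round-to-nearest-even fallback above:
//   data =  2.5 -> diff == 0.5, ceil is odd  -> returns floor -> 2
//   data =  3.5 -> diff == 0.5, ceil is even -> returns ceil  -> 4
//   data = -2.5 -> diff == 0.5, ceil is even -> returns ceil  -> -2
//   data =  2.6 -> diff >  0.5               -> returns ceil  -> 3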
4536
4537 // Convert packed 16-bit integers in a to packed single-precision (32-bit)
4538 // floating-point elements, and store the results in dst.
4539 //
4540 // FOR j := 0 to 3
4541 // i := j*16
4542 // m := j*32
4543 // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
4544 // ENDFOR
4545 //
4546 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
4547 FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
4548 {
4549 return vreinterpretq_m128_f32(
4550 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
4551 }
4552
4553 // Convert packed 32-bit integers in b to packed single-precision (32-bit)
4554 // floating-point elements, store the results in the lower 2 elements of dst,
4555 // and copy the upper 2 packed elements from a to the upper elements of dst.
4556 //
4557 // dst[31:0] := Convert_Int32_To_FP32(b[31:0])
4558 // dst[63:32] := Convert_Int32_To_FP32(b[63:32])
4559 // dst[95:64] := a[95:64]
4560 // dst[127:96] := a[127:96]
4561 //
4562 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
4563 FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
4564 {
4565 return vreinterpretq_m128_f32(
4566 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
4567 vget_high_f32(vreinterpretq_f32_m128(a))));
4568 }
4569
4570 // Convert packed signed 32-bit integers in a to packed single-precision
4571 // (32-bit) floating-point elements, store the results in the lower 2 elements
4572 // of dst, then convert the packed signed 32-bit integers in b to
4573 // single-precision (32-bit) floating-point elements, and store the results in
4574 // the upper 2 elements of dst.
4575 //
4576 // dst[31:0] := Convert_Int32_To_FP32(a[31:0])
4577 // dst[63:32] := Convert_Int32_To_FP32(a[63:32])
4578 // dst[95:64] := Convert_Int32_To_FP32(b[31:0])
4579 // dst[127:96] := Convert_Int32_To_FP32(b[63:32])
4580 //
4581 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
4582 FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
4583 {
4584 return vreinterpretq_m128_f32(vcvtq_f32_s32(
4585 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
4586 }
4587
4588 // Convert the lower packed 8-bit integers in a to packed single-precision
4589 // (32-bit) floating-point elements, and store the results in dst.
4590 //
4591 // FOR j := 0 to 3
4592 // i := j*8
4593 // m := j*32
4594 // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
4595 // ENDFOR
4596 //
4597 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
4598 FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
4599 {
4600 return vreinterpretq_m128_f32(vcvtq_f32_s32(
4601 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
4602 }
4603
4604 // Convert packed unsigned 16-bit integers in a to packed single-precision
4605 // (32-bit) floating-point elements, and store the results in dst.
4606 //
4607 // FOR j := 0 to 3
4608 // i := j*16
4609 // m := j*32
4610 // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
4611 // ENDFOR
4612 //
4613 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
4614 FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
4615 {
4616 return vreinterpretq_m128_f32(
4617 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
4618 }
4619
4620 // Convert the lower packed unsigned 8-bit integers in a to packed
4621 // single-precision (32-bit) floating-point elements, and store the results in
4622 // dst.
4623 //
4624 // FOR j := 0 to 3
4625 // i := j*8
4626 // m := j*32
4627 // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
4628 // ENDFOR
4629 //
4630 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
4631 FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
4632 {
4633 return vreinterpretq_m128_f32(vcvtq_f32_u32(
4634 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
4635 }
4636
4637 // Converts the four single-precision, floating-point values of a to signed
4638 // 32-bit integer values using truncate.
4639 // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
4640 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4641 {
4642 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4643 }
4644
4645 // Convert the lower double-precision (64-bit) floating-point element in a to a
4646 // 64-bit integer with truncation, and store the result in dst.
4647 //
4648 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4649 //
4650 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
4651 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4652 {
4653 #if defined(__aarch64__)
4654 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4655 #else
4656 double ret = *((double *) &a);
4657 return (int64_t) ret;
4658 #endif
4659 }
4660
4661 // Convert the lower double-precision (64-bit) floating-point element in a to a
4662 // 64-bit integer with truncation, and store the result in dst.
4663 //
4664 // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
4665 //
4666 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
4667 #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4668
4669 // Converts the four signed 32-bit integer values of a to single-precision,
4670 // floating-point values
4671 // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
4672 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
4673 {
4674 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
4675 }
4676
4677 // Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
4678 // unsigned 16-bit integers.
4679 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
4680 {
4681 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
4682 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4683 return vreinterpretq_m128i_u16(u16x8);
4684 }
4685
4686 // Converts the four unsigned 8-bit integers in the lower 32 bits to four
4687 // unsigned 32-bit integers.
4688 // https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
4689 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
4690 {
4691 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
4692 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4693 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
4694 return vreinterpretq_m128i_u32(u32x4);
4695 }
4696
4697 // Converts the two unsigned 8-bit integers in the lower 16 bits to two
4698 // unsigned 64-bit integers.
4699 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
4700 {
4701 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
4702 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
4703 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4704 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4705 return vreinterpretq_m128i_u64(u64x2);
4706 }
4707
4708 // Converts the eight signed 8-bit integers in the lower 64 bits to eight
4709 // signed 16-bit integers.
4710 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
4711 {
4712 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
4713 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4714 return vreinterpretq_m128i_s16(s16x8);
4715 }
4716
4717 // Converts the four signed 8-bit integers in the lower 32 bits to four
4718 // signed 32-bit integers.
4719 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
4720 {
4721 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
4722 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
4723 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
4724 return vreinterpretq_m128i_s32(s32x4);
4725 }
4726
4727 // Converts the two signed 8-bit integers in the lower 16 bits to two signed
4728 // 64-bit integers.
4729 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
4730 {
4731 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
4732 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
4733 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4734 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4735 return vreinterpretq_m128i_s64(s64x2);
4736 }
4737
4738 // Converts the four signed 16-bit integers in the lower 64 bits to four signed
4739 // 32-bit integers.
4740 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
4741 {
4742 return vreinterpretq_m128i_s32(
4743 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
4744 }
4745
4746 // Converts the two signed 16-bit integers in the lower 32 bits to two signed
4747 // 64-bit integers.
4748 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
4749 {
4750 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
4751 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
4752 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
4753 return vreinterpretq_m128i_s64(s64x2);
4754 }
4755
4756 // Converts the four unsigned 16-bit integers in the lower 64 bits to four
4757 // unsigned 32-bit integers.
4758 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
4759 {
4760 return vreinterpretq_m128i_u32(
4761 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
4762 }
4763
4764 // Converts the two unsigned 16-bit integers in the lower 32 bits to two
4765 // unsigned 64-bit integers.
4766 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
4767 {
4768 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
4769 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
4770 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
4771 return vreinterpretq_m128i_u64(u64x2);
4772 }
4773
4774 // Converts the two unsigned 32-bit integers in the lower 64 bits to two
4775 // unsigned 64-bit integers.
4776 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
4777 {
4778 return vreinterpretq_m128i_u64(
4779 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
4780 }
4781
4782 // Converts the two signed 32-bit integers in the lower 64 bits to two signed
4783 // 64-bit integers.
4784 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
4785 {
4786 return vreinterpretq_m128i_s64(
4787 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
4788 }
4789
4790 // Converts the four single-precision, floating-point values of a to signed
4791 // 32-bit integer values.
4792 //
4793 // r0 := (int) a0
4794 // r1 := (int) a1
4795 // r2 := (int) a2
4796 // r3 := (int) a3
4797 //
4798 // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
4799 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
4800 // does not support! It is supported on ARMv8-A however.
4801 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
4802 {
4803 #if defined(__aarch64__)
4804 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
4805 #else
4806 uint32x4_t signmask = vdupq_n_u32(0x80000000);
4807 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
4808 vdupq_n_f32(0.5f)); /* +/- 0.5 */
4809 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
4810 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
4811 int32x4_t r_trunc =
4812 vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
4813 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
4814 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
4815 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
4816 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
4817 float32x4_t delta = vsubq_f32(
4818 vreinterpretq_f32_m128(a),
4819 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
4820 uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
4821 return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
4822 #endif
4823 }
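// Worked example (an illustrative sketch, not part of the original header) of
// the branch-free ARMv7 path above, for a lane holding 2.5:
//   r_trunc  = 2
//   plusone  = 1                      (sign bit of -r_trunc shifted down)
//   r_even   = (2 + 1) & ~1 = 2       (nearest even neighbour)
//   r_normal = (int)(2.5 + 0.5) = 3   (round half away from zero)
//   delta    = 2.5 - 2 = 0.5, so the tie is detected and r_even (2) is kept.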
4824
4825 // Copy the lower 32-bit integer in a to dst.
4826 //
4827 // dst[31:0] := a[31:0]
4828 //
4829 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
4830 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4831 {
4832 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4833 }
4834
4835 // Copy the lower 64-bit integer in a to dst.
4836 //
4837 // dst[63:0] := a[63:0]
4838 //
4839 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
4840 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4841 {
4842 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4843 }
4844
4845 // Copy the lower 64-bit integer in a to dst.
4846 //
4847 // dst[63:0] := a[63:0]
4848 //
4849 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
4850 #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4851
4852 // Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
4853 // zero extending the upper bits.
4854 //
4855 // r0 := a
4856 // r1 := 0x0
4857 // r2 := 0x0
4858 // r3 := 0x0
4859 //
4860 // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
4861 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4862 {
4863 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4864 }
4865
4866 // Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
4867 // zero extending the upper bits.
4868 //
4869 // r0 := a
4870 // r1 := 0x0
4871 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4872 {
4873 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4874 }
4875
4876 // Cast vector of type __m128 to type __m128d. This intrinsic is only used for
4877 // compilation and does not generate any instructions, thus it has zero latency.
4878 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
4879 FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
4880 {
4881 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
4882 }
4883
4884 // Applies a type cast to reinterpret four 32-bit floating point values passed
4885 // in as a 128-bit parameter as packed 32-bit integers.
4886 // https://msdn.microsoft.com/en-us/library/bb514099.aspx
4887 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
4888 {
4889 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
4890 }
4891
4892 // Applies a type cast to reinterpret four 32-bit integers passed in as a
4893 // 128-bit parameter as packed 32-bit floating point values.
4894 // https://msdn.microsoft.com/en-us/library/bb514029.aspx
4895 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
4896 {
4897 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
4898 }
4899
4900 // Loads 128-bit value. :
4901 // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
4902 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4903 {
4904 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4905 }
4906
4907 // Load a double-precision (64-bit) floating-point element from memory into both
4908 // elements of dst.
4909 //
4910 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4911 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4912 //
4913 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
4914 FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4915 {
4916 #if defined(__aarch64__)
4917 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4918 #else
4919 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4920 #endif
4921 }
4922
4923 // Load a double-precision (64-bit) floating-point element from memory into the
4924 // upper element of dst, and copy the lower element from a to dst. mem_addr does
4925 // not need to be aligned on any particular boundary.
4926 //
4927 // dst[63:0] := a[63:0]
4928 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4929 //
4930 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
4931 FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4932 {
4933 #if defined(__aarch64__)
4934 return vreinterpretq_m128d_f64(
4935 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4936 #else
4937 return vreinterpretq_m128d_f32(vcombine_f32(
4938 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4939 #endif
4940 }
4941
4942 // Load a double-precision (64-bit) floating-point element from memory into both
4943 // elements of dst.
4944 //
4945 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4946 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4947 //
4948 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
4949 #define _mm_load_pd1 _mm_load1_pd
4950
4951 // Load a double-precision (64-bit) floating-point element from memory into both
4952 // elements of dst.
4953 //
4954 // dst[63:0] := MEM[mem_addr+63:mem_addr]
4955 // dst[127:64] := MEM[mem_addr+63:mem_addr]
4956 //
4957 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
4958 #define _mm_loaddup_pd _mm_load1_pd
4959
4960 // Loads 128-bit value. :
4961 // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
4962 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4963 {
4964 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4965 }
4966
4967 // Load unaligned 32-bit integer from memory into the first element of dst.
4968 //
4969 // dst[31:0] := MEM[mem_addr+31:mem_addr]
4970 // dst[MAX:32] := 0
4971 //
4972 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
4973 FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4974 {
4975 return vreinterpretq_m128i_s32(
4976 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4977 }
4978
4979 // Convert packed double-precision (64-bit) floating-point elements in a to
4980 // packed single-precision (32-bit) floating-point elements, and store the
4981 // results in dst.
4982 //
4983 // FOR j := 0 to 1
4984 // i := 32*j
4985 // k := 64*j
4986 // dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
4987 // ENDFOR
4988 // dst[127:64] := 0
4989 //
4990 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
4991 FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
4992 {
4993 #if defined(__aarch64__)
4994 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
4995 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
4996 #else
4997 float a0 = (float) ((double *) &a)[0];
4998 float a1 = (float) ((double *) &a)[1];
4999 return _mm_set_ps(0, 0, a1, a0);
5000 #endif
5001 }
5002
5003 // Copy the lower double-precision (64-bit) floating-point element of a to dst.
5004 //
5005 // dst[63:0] := a[63:0]
5006 //
5007 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
5008 FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
5009 {
5010 #if defined(__aarch64__)
5011 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
5012 #else
5013 return ((double *) &a)[0];
5014 #endif
5015 }
5016
5017 // Convert packed single-precision (32-bit) floating-point elements in a to
5018 // packed double-precision (64-bit) floating-point elements, and store the
5019 // results in dst.
5020 //
5021 // FOR j := 0 to 1
5022 // i := 64*j
5023 // k := 32*j
5024 // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
5025 // ENDFOR
5026 //
5027 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
5028 FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
5029 {
5030 #if defined(__aarch64__)
5031 return vreinterpretq_m128d_f64(
5032 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
5033 #else
5034 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
5035 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
5036 return _mm_set_pd(a1, a0);
5037 #endif
5038 }
5039
5040 // Cast vector of type __m128d to type __m128i. This intrinsic is only used for
5041 // compilation and does not generate any instructions, thus it has zero latency.
5042 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
5043 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
5044 {
5045 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
5046 }
5047
5048 // Blend packed single-precision (32-bit) floating-point elements from a and b
5049 // using mask, and store the results in dst.
5050 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
5051 FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
5052 {
5053 return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
5054 vreinterpretq_f32_m128(b),
5055 vreinterpretq_f32_m128(a)));
5056 }
5057
5058 // Round the packed single-precision (32-bit) floating-point elements in a using
5059 // the rounding parameter, and store the results as packed single-precision
5060 // floating-point elements in dst.
5061 // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
5062 FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
5063 {
5064 #if defined(__aarch64__)
5065 switch (rounding) {
5066 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
5067 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
5068 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
5069 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
5070 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
5071 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
5072 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
5073 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
5074 default: //_MM_FROUND_CUR_DIRECTION
5075 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
5076 }
5077 #else
5078 float *v_float = (float *) &a;
5079 __m128 zero, neg_inf, pos_inf;
5080
5081 switch (rounding) {
5082 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
5083 return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
5084 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
5085 return (__m128){floorf(v_float[0]), floorf(v_float[1]),
5086 floorf(v_float[2]), floorf(v_float[3])};
5087 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
5088 return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
5089 ceilf(v_float[3])};
5090 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
5091 zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
5092 neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
5093 floorf(v_float[2]), floorf(v_float[3]));
5094 pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
5095 ceilf(v_float[2]), ceilf(v_float[3]));
5096 return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
5097 default: //_MM_FROUND_CUR_DIRECTION
5098 return (__m128){roundf(v_float[0]), roundf(v_float[1]),
5099 roundf(v_float[2]), roundf(v_float[3])};
5100 }
5101 #endif
5102 }
5103
5104 // Convert packed single-precision (32-bit) floating-point elements in a to
5105 // packed 32-bit integers, and store the results in dst.
5106 //
5107 // FOR j := 0 to 1
5108 // i := 32*j
5109 // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
5110 // ENDFOR
5111 //
5112 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
5113 FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
5114 {
5115 #if defined(__aarch64__)
5116 return vreinterpret_m64_s32(
5117 vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
5118 #else
5119 return vreinterpret_m64_s32(
5120 vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
5121 _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
5122 #endif
5123 }
5124
5125 // Round the packed single-precision (32-bit) floating-point elements in a up to
5126 // an integer value, and store the results as packed single-precision
5127 // floating-point elements in dst.
5128 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
5129 FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
5130 {
5131 return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
5132 }
5133
5134 // Round the packed single-precision (32-bit) floating-point elements in a down
5135 // to an integer value, and store the results as packed single-precision
5136 // floating-point elements in dst.
5137 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
5138 FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
5139 {
5140 return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
5141 }
5142
5143
5144 // Load 128-bits of integer data from unaligned memory into dst. This intrinsic
5145 // may perform better than _mm_loadu_si128 when the data crosses a cache line
5146 // boundary.
5147 //
5148 // dst[127:0] := MEM[mem_addr+127:mem_addr]
5149 //
5150 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
5151 #define _mm_lddqu_si128 _mm_loadu_si128
5152
5153 /* Miscellaneous Operations */
5154
5155 // Shifts the 8 signed 16-bit integers in a right by count bits while shifting
5156 // in the sign bit.
5157 //
5158 // r0 := a0 >> count
5159 // r1 := a1 >> count
5160 // ...
5161 // r7 := a7 >> count
5162 //
5163 // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
5164 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5165 {
5166 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5167 if (c > 15)
5168 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5169 return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
5170 }
5171
5172 // Shifts the 4 signed 32-bit integers in a right by count bits while shifting
5173 // in the sign bit.
5174 //
5175 // r0 := a0 >> count
5176 // r1 := a1 >> count
5177 // r2 := a2 >> count
5178 // r3 := a3 >> count
5179 //
5180 // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
5181 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5182 {
5183 int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
5184 if (c > 31)
5185 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5186 return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
5187 }
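// Illustrative usage (not part of the original header); assumes the
// _mm_set1_epi32 defined elsewhere in this file:
//
//   __m128i v = _mm_sra_epi32(_mm_set1_epi32(-8), _mm_cvtsi32_si128(2));
//   // every lane of v now holds -2; a count above 31 would instead fill each
//   // lane with its sign (0 or -1), which is what the _mm_cmplt_epi32 path
//   // above returns.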
5188
5189 // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
5190 // saturates.
5191 // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
5192 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
5193 {
5194 return vreinterpretq_m128i_s8(
5195 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
5196 vqmovn_s16(vreinterpretq_s16_m128i(b))));
5197 }
5198
5199 // Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
5200 // integers and saturates.
5201 //
5202 // r0 := UnsignedSaturate(a0)
5203 // r1 := UnsignedSaturate(a1)
5204 // ...
5205 // r7 := UnsignedSaturate(a7)
5206 // r8 := UnsignedSaturate(b0)
5207 // r9 := UnsignedSaturate(b1)
5208 // ...
5209 // r15 := UnsignedSaturate(b7)
5210 //
5211 // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
5212 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
5213 {
5214 return vreinterpretq_m128i_u8(
5215 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
5216 vqmovun_s16(vreinterpretq_s16_m128i(b))));
5217 }
5218
5219 // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
5220 // and saturates.
5221 //
5222 // r0 := SignedSaturate(a0)
5223 // r1 := SignedSaturate(a1)
5224 // r2 := SignedSaturate(a2)
5225 // r3 := SignedSaturate(a3)
5226 // r4 := SignedSaturate(b0)
5227 // r5 := SignedSaturate(b1)
5228 // r6 := SignedSaturate(b2)
5229 // r7 := SignedSaturate(b3)
5230 //
5231 // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
5232 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
5233 {
5234 return vreinterpretq_m128i_s16(
5235 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
5236 vqmovn_s32(vreinterpretq_s32_m128i(b))));
5237 }
5238
5239 // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
5240 // integers and saturates.
5241 //
5242 // r0 := UnsignedSaturate(a0)
5243 // r1 := UnsignedSaturate(a1)
5244 // r2 := UnsignedSaturate(a2)
5245 // r3 := UnsignedSaturate(a3)
5246 // r4 := UnsignedSaturate(b0)
5247 // r5 := UnsignedSaturate(b1)
5248 // r6 := UnsignedSaturate(b2)
5249 // r7 := UnsignedSaturate(b3)
5250 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
5251 {
5252 return vreinterpretq_m128i_u16(
5253 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
5254 vqmovun_s32(vreinterpretq_s32_m128i(b))));
5255 }
5256
5257 // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
5258 // 8 signed or unsigned 8-bit integers in b.
5259 //
5260 // r0 := a0
5261 // r1 := b0
5262 // r2 := a1
5263 // r3 := b1
5264 // ...
5265 // r14 := a7
5266 // r15 := b7
5267 //
5268 // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
5269 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5270 {
5271 #if defined(__aarch64__)
5272 return vreinterpretq_m128i_s8(
5273 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5274 #else
5275 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5276 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5277 int8x8x2_t result = vzip_s8(a1, b1);
5278 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5279 #endif
5280 }
5281
5282 // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
5283 // lower 4 signed or unsigned 16-bit integers in b.
5284 //
5285 // r0 := a0
5286 // r1 := b0
5287 // r2 := a1
5288 // r3 := b1
5289 // r4 := a2
5290 // r5 := b2
5291 // r6 := a3
5292 // r7 := b3
5293 //
5294 // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
5295 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5296 {
5297 #if defined(__aarch64__)
5298 return vreinterpretq_m128i_s16(
5299 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5300 #else
5301 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5302 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5303 int16x4x2_t result = vzip_s16(a1, b1);
5304 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5305 #endif
5306 }
5307
5308 // Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
5309 // lower 2 signed or unsigned 32-bit integers in b.
5310 //
5311 // r0 := a0
5312 // r1 := b0
5313 // r2 := a1
5314 // r3 := b1
5315 //
5316 // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
5317 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5318 {
5319 #if defined(__aarch64__)
5320 return vreinterpretq_m128i_s32(
5321 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5322 #else
5323 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5324 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5325 int32x2x2_t result = vzip_s32(a1, b1);
5326 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5327 #endif
5328 }
5329
5330 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5331 {
5332 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5333 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5334 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5335 }
5336
5337 // Selects and interleaves the lower two single-precision, floating-point values
5338 // from a and b.
5339 //
5340 // r0 := a0
5341 // r1 := b0
5342 // r2 := a1
5343 // r3 := b1
5344 //
5345 // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
5346 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
5347 {
5348 #if defined(__aarch64__)
5349 return vreinterpretq_m128_f32(
5350 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5351 #else
5352 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
5353 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
5354 float32x2x2_t result = vzip_f32(a1, b1);
5355 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5356 #endif
5357 }
5358
5359 // Selects and interleaves the upper two single-precision, floating-point values
5360 // from a and b.
5361 //
5362 // r0 := a2
5363 // r1 := b2
5364 // r2 := a3
5365 // r3 := b3
5366 //
5367 // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
5368 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
5369 {
5370 #if defined(__aarch64__)
5371 return vreinterpretq_m128_f32(
5372 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5373 #else
5374 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
5375 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
5376 float32x2x2_t result = vzip_f32(a1, b1);
5377 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
5378 #endif
5379 }
5380
5381 // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
5382 // 8 signed or unsigned 8-bit integers in b.
5383 //
5384 // r0 := a8
5385 // r1 := b8
5386 // r2 := a9
5387 // r3 := b9
5388 // ...
5389 // r14 := a15
5390 // r15 := b15
5391 //
5392 // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
5393 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5394 {
5395 #if defined(__aarch64__)
5396 return vreinterpretq_m128i_s8(
5397 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5398 #else
5399 int8x8_t a1 =
5400 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5401 int8x8_t b1 =
5402 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5403 int8x8x2_t result = vzip_s8(a1, b1);
5404 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5405 #endif
5406 }
5407
5408 // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
5409 // upper 4 signed or unsigned 16-bit integers in b.
5410 //
5411 // r0 := a4
5412 // r1 := b4
5413 // r2 := a5
5414 // r3 := b5
5415 // r4 := a6
5416 // r5 := b6
5417 // r6 := a7
5418 // r7 := b7
5419 //
5420 // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
5421 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5422 {
5423 #if defined(__aarch64__)
5424 return vreinterpretq_m128i_s16(
5425 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5426 #else
5427 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5428 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5429 int16x4x2_t result = vzip_s16(a1, b1);
5430 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5431 #endif
5432 }
5433
5434 // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
5435 // upper 2 signed or unsigned 32-bit integers in b.
5436 // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
5437 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5438 {
5439 #if defined(__aarch64__)
5440 return vreinterpretq_m128i_s32(
5441 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5442 #else
5443 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5444 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5445 int32x2x2_t result = vzip_s32(a1, b1);
5446 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5447 #endif
5448 }
5449
5450 // Interleaves the upper signed or unsigned 64-bit integer in a with the
5451 // upper signed or unsigned 64-bit integer in b.
5452 //
5453 // r0 := a1
5454 // r1 := b1
5455 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5456 {
5457 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5458 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5459 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5460 }
5461
5462 // Horizontally compute the minimum amongst the packed unsigned 16-bit integers
5463 // in a, store the minimum and index in dst, and zero the remaining bits in dst.
5464 //
5465 // index[2:0] := 0
5466 // min[15:0] := a[15:0]
5467 // FOR j := 0 to 7
5468 // i := j*16
5469 // IF a[i+15:i] < min[15:0]
5470 // index[2:0] := j
5471 // min[15:0] := a[i+15:i]
5472 // FI
5473 // ENDFOR
5474 // dst[15:0] := min[15:0]
5475 // dst[18:16] := index[2:0]
5476 // dst[127:19] := 0
5477 //
5478 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
5479 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
5480 {
5481 __m128i dst;
5482 uint16_t min, idx = 0;
5483 // Find the minimum value
5484 #if defined(__aarch64__)
5485 min = vminvq_u16(vreinterpretq_u16_m128i(a));
5486 #else
5487 __m64 tmp;
5488 tmp = vreinterpret_m64_u16(
5489 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
5490 vget_high_u16(vreinterpretq_u16_m128i(a))));
5491 tmp = vreinterpret_m64_u16(
5492 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5493 tmp = vreinterpret_m64_u16(
5494 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
5495 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
5496 #endif
5497 // Get the index of the minimum value
5498 int i;
5499 for (i = 0; i < 8; i++) {
5500 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
5501 idx = (uint16_t) i;
5502 break;
5503 }
5504 a = _mm_srli_si128(a, 2);
5505 }
5506 // Generate result
5507 dst = _mm_setzero_si128();
5508 dst = vreinterpretq_m128i_u16(
5509 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
5510 dst = vreinterpretq_m128i_u16(
5511 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
5512 return dst;
5513 }
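// Illustrative result layout (not part of the original header): for
// a = {7, 3, 9, 3, 8, 6, 5, 4} (16-bit lanes, lane 0 first) the minimum is 3
// and its first index is 1, so dst[15:0] = 3, dst[18:16] = 1 and all remaining
// bits of dst are zero.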
5514
5515 // Concatenates a (upper) and b (lower) into a 32-byte value, shifts it right by c bytes, and returns the low 16 bytes.
5516 // https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
5517 // http://blog.csdn.net/hemmingway/article/details/44828303
5518 // Clang requires a macro here, as it is extremely picky about c being a
5519 // literal.
5520 #define _mm_alignr_epi8(a, b, c) \
5521 ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
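// Illustrative usage (not part of the original header):
//
//   __m128i r = _mm_alignr_epi8(a, b, 4);
//   // r holds bytes 4..15 of b followed by bytes 0..3 of a, i.e. the 32-byte
//   // concatenation a:b shifted right by 4 bytes and truncated to 16 bytes.
//   // The shift count must be a compile-time constant, hence the macro.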
5522
5523 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5524 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5525 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5526 // otherwise set CF to 0. Return the CF value.
5527 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
5528 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
5529 {
5530 int64x2_t s64 =
5531 vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
5532 vreinterpretq_s64_m128i(b));
5533 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5534 }
5535
5536 // Compute the bitwise AND of 128 bits (representing integer data) in a and b,
5537 // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
5538 // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
5539 // otherwise set CF to 0. Return the ZF value.
5540 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
5541 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
5542 {
5543 int64x2_t s64 =
5544 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
5545 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
5546 }
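// Illustrative usage (not part of the original header): _mm_testz_si128(a, b)
// returns 1 exactly when (a & b) is all zeroes, so passing the same vector
// twice tests whether it is zero:
//
//   int is_zero = _mm_testz_si128(v, v);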
5547
5548 // Extracts the selected signed or unsigned 8-bit integer from a and zero
5549 // extends.
5550 // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
5551 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
5552
5553 // Inserts the least significant 8 bits of b into the selected 8-bit integer
5554 // of a.
5555 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
5556 // __constrange(0,16) int imm)
5557 #define _mm_insert_epi8(a, b, imm) \
5558 __extension__({ \
5559 vreinterpretq_m128i_s8( \
5560 vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
5561 })
5562
5563 // Extracts the selected signed or unsigned 16-bit integer from a and zero
5564 // extends.
5565 // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
5566 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
5567 #define _mm_extract_epi16(a, imm) \
5568 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
5569
5570 // Inserts the least significant 16 bits of b into the selected 16-bit integer
5571 // of a.
5572 // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
5573 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
5574 // __constrange(0,8) int imm)
5575 #define _mm_insert_epi16(a, b, imm) \
5576 __extension__({ \
5577 vreinterpretq_m128i_s16( \
5578 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
5579 })
5580
5581 // Extracts the selected signed or unsigned 32-bit integer from a and zero
5582 // extends.
5583 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
5584 #define _mm_extract_epi32(a, imm) \
5585 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
5586
5587 // Extracts the selected single-precision (32-bit) floating-point from a.
5588 // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
5589 #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
5590
5591 // Inserts the least significant 32 bits of b into the selected 32-bit integer
5592 // of a.
5593 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
5594 // __constrange(0,4) int imm)
5595 #define _mm_insert_epi32(a, b, imm) \
5596 __extension__({ \
5597 vreinterpretq_m128i_s32( \
5598 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
5599 })
5600
5601 // Extracts the selected signed or unsigned 64-bit integer from a and zero
5602 // extends.
5603 // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
5604 #define _mm_extract_epi64(a, imm) \
5605 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
5606
5607 // Inserts the least significant 64 bits of b into the selected 64-bit integer
5608 // of a.
5609 // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
5610 // __constrange(0,2) int imm)
5611 #define _mm_insert_epi64(a, b, imm) \
5612 __extension__({ \
5613 vreinterpretq_m128i_s64( \
5614 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
5615 })
5616
5617 // Count the number of bits set to 1 in unsigned 32-bit integer a, and
5618 // return that count in dst.
5619 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
5620 FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
5621 {
5622 #if defined(__aarch64__)
5623 #if __has_builtin(__builtin_popcount)
5624 return __builtin_popcount(a);
5625 #else
5626 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
5627 #endif
5628 #else
5629 uint32_t count = 0;
5630 uint8x8_t input_val, count8x8_val;
5631 uint16x4_t count16x4_val;
5632 uint32x2_t count32x2_val;
5633
5634 input_val = vld1_u8((uint8_t *) &a);
5635 count8x8_val = vcnt_u8(input_val);
5636 count16x4_val = vpaddl_u8(count8x8_val);
5637 count32x2_val = vpaddl_u16(count16x4_val);
5638
5639 vst1_u32(&count, count32x2_val);
5640 return count;
5641 #endif
5642 }
5643
5644 // Count the number of bits set to 1 in unsigned 64-bit integer a, and
5645 // return that count in dst.
5646 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
5647 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
5648 {
5649 #if defined(__aarch64__)
5650 #if __has_builtin(__builtin_popcountll)
5651 return __builtin_popcountll(a);
5652 #else
5653 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
5654 #endif
5655 #else
5656 uint64_t count = 0;
5657 uint8x8_t input_val, count8x8_val;
5658 uint16x4_t count16x4_val;
5659 uint32x2_t count32x2_val;
5660 uint64x1_t count64x1_val;
5661
5662 input_val = vld1_u8((uint8_t *) &a);
5663 count8x8_val = vcnt_u8(input_val);
5664 count16x4_val = vpaddl_u8(count8x8_val);
5665 count32x2_val = vpaddl_u16(count16x4_val);
5666 count64x1_val = vpaddl_u32(count32x2_val);
5667 vst1_u64(&count, count64x1_val);
5668 return count;
5669 #endif
5670 }
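// Illustrative usage sketch (not part of the sse2neon API): accumulate the
// population count of a word array with the helpers above. The helper name
// `example_popcount_words` is hypothetical.
#if 0 /* example only; not compiled */
static inline uint64_t example_popcount_words(const uint32_t *words, size_t n)
{
    uint64_t total = 0;
    for (size_t i = 0; i < n; i++)
        total += (uint64_t) _mm_popcnt_u32(words[i]);  // bits set in each word
    return total;  // e.g. {0x0F, 0x01} -> 5
}
#endif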
5671
5672 // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
5673 // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
5674 // transposed matrix in these vectors (row0 now contains column 0, etc.).
5675 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
5676 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
5677 do { \
5678 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
5679 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
5680 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
5681 vget_low_f32(ROW23.val[0])); \
5682 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
5683 vget_low_f32(ROW23.val[1])); \
5684 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
5685 vget_high_f32(ROW23.val[0])); \
5686 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
5687 vget_high_f32(ROW23.val[1])); \
5688 } while (0)
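// Illustrative usage sketch (not part of the sse2neon API): transpose a 4x4
// float matrix held in row-major storage using the macro above. The helper
// name `example_transpose4x4` is hypothetical.
#if 0 /* example only; not compiled */
static inline void example_transpose4x4(float out[16], const float in[16])
{
    __m128 r0 = _mm_loadu_ps(in + 0);
    __m128 r1 = _mm_loadu_ps(in + 4);
    __m128 r2 = _mm_loadu_ps(in + 8);
    __m128 r3 = _mm_loadu_ps(in + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // rows now hold the input's columns
    _mm_storeu_ps(out + 0, r0);
    _mm_storeu_ps(out + 4, r1);
    _mm_storeu_ps(out + 8, r2);
    _mm_storeu_ps(out + 12, r3);
}
#endif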
5689
5690 /* Crypto Extensions */
5691
5692 #if defined(__ARM_FEATURE_CRYPTO)
5693 // Wraps vmull_p64
5694 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5695 {
5696 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
5697 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
5698 return vreinterpretq_u64_p128(vmull_p64(a, b));
5699 }
5700 #else // ARMv7 polyfill
5701 // ARMv7 (and AArch64 builds without the Crypto extension) lack vmull_p64, but do have vmull_p8.
5702 //
5703 // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
5704 // 64-bit->128-bit polynomial multiply.
5705 //
5706 // It needs some work and is somewhat slow, but it is still faster than all
5707 // known scalar methods.
5708 //
5709 // Algorithm adapted to C from
5710 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
5711 // from "Fast Software Polynomial Multiplication on ARM Processors Using the
5712 // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
5713 // (https://hal.inria.fr/hal-01506572)
5714 static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
5715 {
5716 poly8x8_t a = vreinterpret_p8_u64(_a);
5717 poly8x8_t b = vreinterpret_p8_u64(_b);
5718
5719 // Masks
5720 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
5721 vcreate_u8(0x00000000ffffffff));
5722 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
5723 vcreate_u8(0x0000000000000000));
5724
5725 // Do the multiplies, rotating with vext to get all combinations
5726 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
5727 uint8x16_t e =
5728 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
5729 uint8x16_t f =
5730 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
5731 uint8x16_t g =
5732 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
5733 uint8x16_t h =
5734 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
5735 uint8x16_t i =
5736 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
5737 uint8x16_t j =
5738 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
5739 uint8x16_t k =
5740 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
5741
5742 // Add cross products
5743 uint8x16_t l = veorq_u8(e, f); // L = E + F
5744 uint8x16_t m = veorq_u8(g, h); // M = G + H
5745 uint8x16_t n = veorq_u8(i, j); // N = I + J
5746
5747 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
5748 // instructions.
5749 #if defined(__aarch64__)
5750 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
5751 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5752 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
5753 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
5754 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
5755 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5756 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
5757 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
5758 #else
5759 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
5760 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
5761 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
5762 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
5763 #endif
5764 // t0 = (L) (P0 + P1) << 8
5765 // t1 = (M) (P2 + P3) << 16
5766 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
5767 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
5768 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
5769
5770 // t2 = (N) (P4 + P5) << 24
5771 // t3 = (K) (P6 + P7) << 32
5772 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
5773 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
5774 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
5775
5776 // De-interleave
5777 #if defined(__aarch64__)
5778 uint8x16_t t0 = vreinterpretq_u8_u64(
5779 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5780 uint8x16_t t1 = vreinterpretq_u8_u64(
5781 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
5782 uint8x16_t t2 = vreinterpretq_u8_u64(
5783 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5784 uint8x16_t t3 = vreinterpretq_u8_u64(
5785 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
5786 #else
5787 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
5788 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
5789 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
5790 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
5791 #endif
5792 // Shift the cross products
5793 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
5794 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
5795 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
5796 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
5797
5798 // Accumulate the products
5799 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
5800 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
5801 uint8x16_t mix = veorq_u8(d, cross1);
5802 uint8x16_t r = veorq_u8(mix, cross2);
5803 return vreinterpretq_u64_u8(r);
5804 }
5805 #endif // ARMv7 polyfill
5806
5807 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
5808 {
5809 uint64x2_t a = vreinterpretq_u64_m128i(_a);
5810 uint64x2_t b = vreinterpretq_u64_m128i(_b);
5811 switch (imm & 0x11) {
5812 case 0x00:
5813 return vreinterpretq_m128i_u64(
5814 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
5815 case 0x01:
5816 return vreinterpretq_m128i_u64(
5817 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
5818 case 0x10:
5819 return vreinterpretq_m128i_u64(
5820 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
5821 case 0x11:
5822 return vreinterpretq_m128i_u64(
5823 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
5824 default:
5825 abort();
5826 }
5827 }
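// Illustrative usage sketch (not part of the sse2neon API): carry-less
// multiply of two 64-bit values into a 128-bit polynomial product. Bit 0 of
// the immediate selects the half of `a`, bit 4 the half of `b`. The helper
// name `example_clmul_lo_lo` is hypothetical.
#if 0 /* example only; not compiled */
static inline __m128i example_clmul_lo_lo(uint64_t x, uint64_t y)
{
    __m128i a = _mm_set_epi64x(0, (int64_t) x);  // x in the low 64-bit lane
    __m128i b = _mm_set_epi64x(0, (int64_t) y);
    return _mm_clmulepi64_si128(a, b, 0x00);     // low(a) * low(b), carry-less
}
#endif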
5828
5829 #if !defined(__ARM_FEATURE_CRYPTO)
5830 /* clang-format off */
5831 #define SSE2NEON_AES_DATA(w) \
5832 { \
5833 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
5834 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
5835 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
5836 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
5837 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
5838 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
5839 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
5840 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
5841 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
5842 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
5843 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
5844 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
5845 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
5846 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
5847 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
5848 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
5849 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
5850 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
5851 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
5852 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
5853 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
5854 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
5855 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
5856 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
5857 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
5858 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
5859 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
5860 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
5861 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
5862 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
5863 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
5864 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
5865 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
5866 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
5867 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
5868 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
5869 w(0xb0), w(0x54), w(0xbb), w(0x16) \
5870 }
5871 /* clang-format on */
5872
5873 /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
5874 #define SSE2NEON_AES_H0(x) (x)
5875 static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
5876 #undef SSE2NEON_AES_H0
5877
5878 // In the absence of crypto extensions, implement aesenc using regular neon
5879 // intrinsics instead. See:
5880 // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
5881 // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
5882 // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
5883 // for more information. Reproduced with permission of the author.
5884 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
5885 {
5886 #if defined(__aarch64__)
5887 static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
5888 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
5889 0xc, 0x1, 0x6, 0xb};
5890 static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
5891 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
5892
5893 uint8x16_t v;
5894 uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
5895
5896 // shift rows
5897 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
5898
5899 // sub bytes
5900 v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
5901 v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
5902 v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
5903 v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
5904
5905 // mix columns
5906 w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
5907 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
5908 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
5909
5910 // add round key
5911 return vreinterpretq_m128i_u8(w) ^ RoundKey;
5912
5913 #else /* ARMv7-A NEON implementation */
5914 #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
5915 (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
5916 (b0))
5917 #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
5918 #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
5919 #define SSE2NEON_AES_U0(p) \
5920 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
5921 #define SSE2NEON_AES_U1(p) \
5922 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
5923 #define SSE2NEON_AES_U2(p) \
5924 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
5925 #define SSE2NEON_AES_U3(p) \
5926 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
5927 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
5928 SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
5929 SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
5930 SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
5931 SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
5932 };
5933 #undef SSE2NEON_AES_B2W
5934 #undef SSE2NEON_AES_F2
5935 #undef SSE2NEON_AES_F3
5936 #undef SSE2NEON_AES_U0
5937 #undef SSE2NEON_AES_U1
5938 #undef SSE2NEON_AES_U2
5939 #undef SSE2NEON_AES_U3
5940
5941 uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
5942 uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
5943 uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
5944 uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
5945
5946 __m128i out = _mm_set_epi32(
5947 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
5948 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
5949 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
5950 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
5951 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
5952 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
5953 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
5954 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
5955
5956 return _mm_xor_si128(out, RoundKey);
5957 #endif
5958 }
5959
5960 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
5961 {
5962 /* FIXME: optimize for NEON */
5963 uint8_t v[4][4] = {
5964 [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
5965 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
5966 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
5967 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
5968 [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
5969 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
5970 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
5971 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
5972 [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
5973 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
5974 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
5975 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
5976 [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
5977 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
5978 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
5979 SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
5980 };
5981 for (int i = 0; i < 16; i++)
5982 vreinterpretq_nth_u8_m128i(a, i) =
5983 v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
5984 return a;
5985 }
5986
5987 // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
5988 // This instruction generates a round key for AES encryption. See
5989 // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
5990 // for details.
5991 //
5992 // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
5993 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
5994 {
5995 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
5996 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
5997 for (int i = 0; i < 4; ++i) {
5998 ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
5999 ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
6000 }
6001 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
6002 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
6003 }
6004 #undef SSE2NEON_AES_DATA
6005
6006 #else /* __ARM_FEATURE_CRYPTO */
6007 // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
6008 // AESMC and then manually applying the real key as an xor operation. This
6009 // unfortunately means an additional xor op; the compiler should be able to
6010 // optimize this away for repeated calls, however. See
6011 // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
6012 // for more details.
6013 FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
6014 {
6015 return vreinterpretq_m128i_u8(
6016 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
6017 vreinterpretq_u8_m128i(b));
6018 }
6019
6020 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
6021 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
6022 {
6023 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
6024 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
6025 RoundKey);
6026 }
6027
6028 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
6029 {
6030 // AESE does ShiftRows and SubBytes on A
6031 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
6032
6033 uint8x16_t dest = {
6034 // Undo ShiftRows step from AESE and extract X1 and X3
6035 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
6036 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
6037 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
6038 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
6039 };
6040 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
6041 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
6042 }
6043 #endif
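// Illustrative usage sketch (not part of the sse2neon API): encrypt one
// 16-byte block with AES-128 in the usual AES-NI fashion, assuming the 11
// round keys have already been expanded elsewhere. The helper name
// `example_aes128_encrypt_block` is hypothetical.
#if 0 /* example only; not compiled */
static inline __m128i example_aes128_encrypt_block(__m128i block,
                                                   const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]);             // initial key whitening
    for (int round = 1; round < 10; round++)
        block = _mm_aesenc_si128(block, rk[round]);  // 9 full rounds
    return _mm_aesenclast_si128(block, rk[10]);      // last round, no MixColumns
}
#endif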
6044
6045 /* Streaming Extensions */
6046
6047 // Guarantees that every preceding store is globally visible before any
6048 // subsequent store.
6049 // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
6050 FORCE_INLINE void _mm_sfence(void)
6051 {
6052 __sync_synchronize();
6053 }
6054
6055 // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
6056 // point elements) from a into memory using a non-temporal memory hint.
6057 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
6058 FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
6059 {
6060 #if __has_builtin(__builtin_nontemporal_store)
6061 __builtin_nontemporal_store(a, (float32x4_t *) p);
6062 #else
6063 vst1q_f32(p, vreinterpretq_f32_m128(a));
6064 #endif
6065 }
6066
6067 // Stores the data in a to the address p without polluting the caches. If the
6068 // cache line containing address p is already in the cache, the cache will be
6069 // updated.
6070 // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
6071 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
6072 {
6073 #if __has_builtin(__builtin_nontemporal_store)
6074 __builtin_nontemporal_store(a, p);
6075 #else
6076 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
6077 #endif
6078 }
6079
6080 // Load 128-bits of integer data from memory into dst using a non-temporal
6081 // memory hint. mem_addr must be aligned on a 16-byte boundary or a
6082 // general-protection exception may be generated.
6083 //
6084 // dst[127:0] := MEM[mem_addr+127:mem_addr]
6085 //
6086 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
6087 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
6088 {
6089 #if __has_builtin(__builtin_nontemporal_load)
6090 return __builtin_nontemporal_load(p);
6091 #else
6092 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
6093 #endif
6094 }
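// Illustrative usage sketch (not part of the sse2neon API): copy a float
// buffer with non-temporal stores and publish it with a store fence. Assumes
// `dst` is 16-byte aligned and `n` is a multiple of 4; the helper name
// `example_stream_copy` is hypothetical.
#if 0 /* example only; not compiled */
static inline void example_stream_copy(float *dst, const float *src, size_t n)
{
    for (size_t i = 0; i < n; i += 4)
        _mm_stream_ps(dst + i, _mm_loadu_ps(src + i));
    _mm_sfence();  // streamed stores become visible before any later store
}
#endif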
6095
6096 // Cache line containing p is flushed and invalidated from all caches in the
6097 // coherency domain.
6098 // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
6099 FORCE_INLINE void _mm_clflush(void const *p)
6100 {
6101 (void) p;
6102 // no direct equivalent in NEON; intentionally a no-op
6103 }
6104
6105 // Allocate aligned blocks of memory.
6106 // https://software.intel.com/en-us/
6107 // cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
6108 FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
6109 {
6110 void *ptr;
6111 if (align == 1)
6112 return malloc(size);
6113 if (align == 2 || (sizeof(void *) == 8 && align == 4))
6114 align = sizeof(void *);
6115 if (!posix_memalign(&ptr, align, size))
6116 return ptr;
6117 return NULL;
6118 }
6119
6120 FORCE_INLINE void _mm_free(void *addr)
6121 {
6122 free(addr);
6123 }
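// Illustrative usage sketch (not part of the sse2neon API): grab a 16-byte
// aligned scratch buffer suitable for the aligned load/store intrinsics and
// release it with the matching free. The helper name
// `example_aligned_scratch` is hypothetical.
#if 0 /* example only; not compiled */
static inline void example_aligned_scratch(void)
{
    float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
    if (buf) {
        _mm_store_ps(buf, _mm_set1_ps(0.0f));  // aligned store is now valid
        _mm_free(buf);                         // must pair with _mm_malloc
    }
}
#endif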
6124
6125 // Starting with the initial value in crc, accumulates a CRC32 value for
6126 // unsigned 8-bit integer v.
6127 // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
6128 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
6129 {
6130 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6131 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
6132 : [c] "+r"(crc)
6133 : [v] "r"(v));
6134 #else
6135 crc ^= v;
6136 for (int bit = 0; bit < 8; bit++) {
6137 if (crc & 1)
6138 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
6139 else
6140 crc = (crc >> 1);
6141 }
6142 #endif
6143 return crc;
6144 }
6145
6146 // Starting with the initial value in crc, accumulates a CRC32 value for
6147 // unsigned 16-bit integer v.
6148 // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
6149 FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
6150 {
6151 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6152 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
6153 : [c] "+r"(crc)
6154 : [v] "r"(v));
6155 #else
6156 crc = _mm_crc32_u8(crc, v & 0xff);
6157 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
6158 #endif
6159 return crc;
6160 }
6161
6162 // Starting with the initial value in crc, accumulates a CRC32 value for
6163 // unsigned 32-bit integer v.
6164 // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
6165 FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
6166 {
6167 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6168 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
6169 : [c] "+r"(crc)
6170 : [v] "r"(v));
6171 #else
6172 crc = _mm_crc32_u16(crc, v & 0xffff);
6173 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
6174 #endif
6175 return crc;
6176 }
6177
6178 // Starting with the initial value in crc, accumulates a CRC32 value for
6179 // unsigned 64-bit integer v.
6180 // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
6181 FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
6182 {
6183 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
6184 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
6185 : [c] "+r"(crc)
6186 : [v] "r"(v));
6187 #else
6188 crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
6189 crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
6190 #endif
6191 return crc;
6192 }
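// Illustrative usage sketch (not part of the sse2neon API): CRC-32C
// (Castagnoli) over a byte buffer, eight bytes at a time with a byte-wise
// tail, using the customary ~0 seed and final inversion. Assumes <string.h>
// is available for memcpy; the helper name `example_crc32c` is hypothetical.
#if 0 /* example only; not compiled */
static inline uint32_t example_crc32c(const uint8_t *data, size_t len)
{
    uint64_t crc = 0xFFFFFFFF;
    while (len >= 8) {
        uint64_t chunk;
        memcpy(&chunk, data, sizeof(chunk));  // unaligned-safe 64-bit load
        crc = _mm_crc32_u64(crc, chunk);
        data += 8;
        len -= 8;
    }
    while (len--)
        crc = _mm_crc32_u8((uint32_t) crc, *data++);
    return (uint32_t) crc ^ 0xFFFFFFFF;
}
#endif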
6193
6194 #if defined(__GNUC__) || defined(__clang__)
6195 #pragma pop_macro("ALIGN_STRUCT")
6196 #pragma pop_macro("FORCE_INLINE")
6197 #endif
6198
6199 #if defined(__GNUC__)
6200 #pragma GCC pop_options
6201 #endif
6202
6203 #endif
6204