/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
906f32e7eSjoerg 
1006f32e7eSjoerg #ifndef __EMMINTRIN_H
1106f32e7eSjoerg #define __EMMINTRIN_H
1206f32e7eSjoerg 
1306f32e7eSjoerg #include <xmmintrin.h>
1406f32e7eSjoerg 
/* Public 128-bit vector types: two doubles / two 64-bit integers, declared
 * with 16-byte alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Same layouts as above but declared with 1-byte alignment (the `_u`
 * variants). */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines.  */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

/* Define the default attributes for the functions in this file.
 * __DEFAULT_FN_ATTRS targets "sse2" with a 128-bit minimum vector width;
 * __DEFAULT_FN_ATTRS_MMX targets "mmx,sse2" with a 64-bit minimum vector
 * width. Both force inlining and suppress debug info. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
                 __min_vector_width__(64)))
3906f32e7eSjoerg 
4006f32e7eSjoerg /// Adds lower double-precision values in both operands and returns the
4106f32e7eSjoerg ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
4206f32e7eSjoerg ///    are copied from the upper double-precision value of the first operand.
4306f32e7eSjoerg ///
4406f32e7eSjoerg /// \headerfile <x86intrin.h>
4506f32e7eSjoerg ///
4606f32e7eSjoerg /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
4706f32e7eSjoerg ///
4806f32e7eSjoerg /// \param __a
4906f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
5006f32e7eSjoerg /// \param __b
5106f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
5206f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
5306f32e7eSjoerg ///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
5406f32e7eSjoerg ///    from the upper 64 bits of the first source operand.
5506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a,__m128d __b)5606f32e7eSjoerg _mm_add_sd(__m128d __a, __m128d __b)
5706f32e7eSjoerg {
5806f32e7eSjoerg   __a[0] += __b[0];
5906f32e7eSjoerg   return __a;
6006f32e7eSjoerg }
6106f32e7eSjoerg 
6206f32e7eSjoerg /// Adds two 128-bit vectors of [2 x double].
6306f32e7eSjoerg ///
6406f32e7eSjoerg /// \headerfile <x86intrin.h>
6506f32e7eSjoerg ///
6606f32e7eSjoerg /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
6706f32e7eSjoerg ///
6806f32e7eSjoerg /// \param __a
6906f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
7006f32e7eSjoerg /// \param __b
7106f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
7206f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the sums of both
7306f32e7eSjoerg ///    operands.
7406f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a,__m128d __b)7506f32e7eSjoerg _mm_add_pd(__m128d __a, __m128d __b)
7606f32e7eSjoerg {
7706f32e7eSjoerg   return (__m128d)((__v2df)__a + (__v2df)__b);
7806f32e7eSjoerg }
7906f32e7eSjoerg 
8006f32e7eSjoerg /// Subtracts the lower double-precision value of the second operand
8106f32e7eSjoerg ///    from the lower double-precision value of the first operand and returns
8206f32e7eSjoerg ///    the difference in the lower 64 bits of the result. The upper 64 bits of
8306f32e7eSjoerg ///    the result are copied from the upper double-precision value of the first
8406f32e7eSjoerg ///    operand.
8506f32e7eSjoerg ///
8606f32e7eSjoerg /// \headerfile <x86intrin.h>
8706f32e7eSjoerg ///
8806f32e7eSjoerg /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
8906f32e7eSjoerg ///
9006f32e7eSjoerg /// \param __a
9106f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the minuend.
9206f32e7eSjoerg /// \param __b
9306f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the subtrahend.
9406f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
9506f32e7eSjoerg ///    difference of the lower 64 bits of both operands. The upper 64 bits are
9606f32e7eSjoerg ///    copied from the upper 64 bits of the first source operand.
9706f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a,__m128d __b)9806f32e7eSjoerg _mm_sub_sd(__m128d __a, __m128d __b)
9906f32e7eSjoerg {
10006f32e7eSjoerg   __a[0] -= __b[0];
10106f32e7eSjoerg   return __a;
10206f32e7eSjoerg }
10306f32e7eSjoerg 
10406f32e7eSjoerg /// Subtracts two 128-bit vectors of [2 x double].
10506f32e7eSjoerg ///
10606f32e7eSjoerg /// \headerfile <x86intrin.h>
10706f32e7eSjoerg ///
10806f32e7eSjoerg /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
10906f32e7eSjoerg ///
11006f32e7eSjoerg /// \param __a
11106f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the minuend.
11206f32e7eSjoerg /// \param __b
11306f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the subtrahend.
11406f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the differences between
11506f32e7eSjoerg ///    both operands.
11606f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a,__m128d __b)11706f32e7eSjoerg _mm_sub_pd(__m128d __a, __m128d __b)
11806f32e7eSjoerg {
11906f32e7eSjoerg   return (__m128d)((__v2df)__a - (__v2df)__b);
12006f32e7eSjoerg }
12106f32e7eSjoerg 
12206f32e7eSjoerg /// Multiplies lower double-precision values in both operands and returns
12306f32e7eSjoerg ///    the product in the lower 64 bits of the result. The upper 64 bits of the
12406f32e7eSjoerg ///    result are copied from the upper double-precision value of the first
12506f32e7eSjoerg ///    operand.
12606f32e7eSjoerg ///
12706f32e7eSjoerg /// \headerfile <x86intrin.h>
12806f32e7eSjoerg ///
12906f32e7eSjoerg /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
13006f32e7eSjoerg ///
13106f32e7eSjoerg /// \param __a
13206f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
13306f32e7eSjoerg /// \param __b
13406f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
13506f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
13606f32e7eSjoerg ///    product of the lower 64 bits of both operands. The upper 64 bits are
13706f32e7eSjoerg ///    copied from the upper 64 bits of the first source operand.
13806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a,__m128d __b)13906f32e7eSjoerg _mm_mul_sd(__m128d __a, __m128d __b)
14006f32e7eSjoerg {
14106f32e7eSjoerg   __a[0] *= __b[0];
14206f32e7eSjoerg   return __a;
14306f32e7eSjoerg }
14406f32e7eSjoerg 
14506f32e7eSjoerg /// Multiplies two 128-bit vectors of [2 x double].
14606f32e7eSjoerg ///
14706f32e7eSjoerg /// \headerfile <x86intrin.h>
14806f32e7eSjoerg ///
14906f32e7eSjoerg /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
15006f32e7eSjoerg ///
15106f32e7eSjoerg /// \param __a
15206f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands.
15306f32e7eSjoerg /// \param __b
15406f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands.
15506f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the products of both
15606f32e7eSjoerg ///    operands.
15706f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a,__m128d __b)15806f32e7eSjoerg _mm_mul_pd(__m128d __a, __m128d __b)
15906f32e7eSjoerg {
16006f32e7eSjoerg   return (__m128d)((__v2df)__a * (__v2df)__b);
16106f32e7eSjoerg }
16206f32e7eSjoerg 
16306f32e7eSjoerg /// Divides the lower double-precision value of the first operand by the
16406f32e7eSjoerg ///    lower double-precision value of the second operand and returns the
16506f32e7eSjoerg ///    quotient in the lower 64 bits of the result. The upper 64 bits of the
16606f32e7eSjoerg ///    result are copied from the upper double-precision value of the first
16706f32e7eSjoerg ///    operand.
16806f32e7eSjoerg ///
16906f32e7eSjoerg /// \headerfile <x86intrin.h>
17006f32e7eSjoerg ///
17106f32e7eSjoerg /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
17206f32e7eSjoerg ///
17306f32e7eSjoerg /// \param __a
17406f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the dividend.
17506f32e7eSjoerg /// \param __b
17606f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing divisor.
17706f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
17806f32e7eSjoerg ///    quotient of the lower 64 bits of both operands. The upper 64 bits are
17906f32e7eSjoerg ///    copied from the upper 64 bits of the first source operand.
18006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a,__m128d __b)18106f32e7eSjoerg _mm_div_sd(__m128d __a, __m128d __b)
18206f32e7eSjoerg {
18306f32e7eSjoerg   __a[0] /= __b[0];
18406f32e7eSjoerg   return __a;
18506f32e7eSjoerg }
18606f32e7eSjoerg 
18706f32e7eSjoerg /// Performs an element-by-element division of two 128-bit vectors of
18806f32e7eSjoerg ///    [2 x double].
18906f32e7eSjoerg ///
19006f32e7eSjoerg /// \headerfile <x86intrin.h>
19106f32e7eSjoerg ///
19206f32e7eSjoerg /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
19306f32e7eSjoerg ///
19406f32e7eSjoerg /// \param __a
19506f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the dividend.
19606f32e7eSjoerg /// \param __b
19706f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the divisor.
19806f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the quotients of both
19906f32e7eSjoerg ///    operands.
20006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a,__m128d __b)20106f32e7eSjoerg _mm_div_pd(__m128d __a, __m128d __b)
20206f32e7eSjoerg {
20306f32e7eSjoerg   return (__m128d)((__v2df)__a / (__v2df)__b);
20406f32e7eSjoerg }
20506f32e7eSjoerg 
20606f32e7eSjoerg /// Calculates the square root of the lower double-precision value of
20706f32e7eSjoerg ///    the second operand and returns it in the lower 64 bits of the result.
20806f32e7eSjoerg ///    The upper 64 bits of the result are copied from the upper
20906f32e7eSjoerg ///    double-precision value of the first operand.
21006f32e7eSjoerg ///
21106f32e7eSjoerg /// \headerfile <x86intrin.h>
21206f32e7eSjoerg ///
21306f32e7eSjoerg /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
21406f32e7eSjoerg ///
21506f32e7eSjoerg /// \param __a
21606f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands. The
21706f32e7eSjoerg ///    upper 64 bits of this operand are copied to the upper 64 bits of the
21806f32e7eSjoerg ///    result.
21906f32e7eSjoerg /// \param __b
22006f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands. The
22106f32e7eSjoerg ///    square root is calculated using the lower 64 bits of this operand.
22206f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
22306f32e7eSjoerg ///    square root of the lower 64 bits of operand \a __b, and whose upper 64
22406f32e7eSjoerg ///    bits are copied from the upper 64 bits of operand \a __a.
22506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a,__m128d __b)22606f32e7eSjoerg _mm_sqrt_sd(__m128d __a, __m128d __b)
22706f32e7eSjoerg {
22806f32e7eSjoerg   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
22906f32e7eSjoerg   return __extension__ (__m128d) { __c[0], __a[1] };
23006f32e7eSjoerg }
23106f32e7eSjoerg 
23206f32e7eSjoerg /// Calculates the square root of the each of two values stored in a
23306f32e7eSjoerg ///    128-bit vector of [2 x double].
23406f32e7eSjoerg ///
23506f32e7eSjoerg /// \headerfile <x86intrin.h>
23606f32e7eSjoerg ///
23706f32e7eSjoerg /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
23806f32e7eSjoerg ///
23906f32e7eSjoerg /// \param __a
24006f32e7eSjoerg ///    A 128-bit vector of [2 x double].
24106f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the square roots of the
24206f32e7eSjoerg ///    values in the operand.
24306f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_pd(__m128d __a)24406f32e7eSjoerg _mm_sqrt_pd(__m128d __a)
24506f32e7eSjoerg {
24606f32e7eSjoerg   return __builtin_ia32_sqrtpd((__v2df)__a);
24706f32e7eSjoerg }
24806f32e7eSjoerg 
24906f32e7eSjoerg /// Compares lower 64-bit double-precision values of both operands, and
25006f32e7eSjoerg ///    returns the lesser of the pair of values in the lower 64-bits of the
25106f32e7eSjoerg ///    result. The upper 64 bits of the result are copied from the upper
25206f32e7eSjoerg ///    double-precision value of the first operand.
25306f32e7eSjoerg ///
25406f32e7eSjoerg /// \headerfile <x86intrin.h>
25506f32e7eSjoerg ///
25606f32e7eSjoerg /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
25706f32e7eSjoerg ///
25806f32e7eSjoerg /// \param __a
25906f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands. The
26006f32e7eSjoerg ///    lower 64 bits of this operand are used in the comparison.
26106f32e7eSjoerg /// \param __b
26206f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands. The
26306f32e7eSjoerg ///    lower 64 bits of this operand are used in the comparison.
26406f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
26506f32e7eSjoerg ///    minimum value between both operands. The upper 64 bits are copied from
26606f32e7eSjoerg ///    the upper 64 bits of the first source operand.
26706f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a,__m128d __b)26806f32e7eSjoerg _mm_min_sd(__m128d __a, __m128d __b)
26906f32e7eSjoerg {
27006f32e7eSjoerg   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
27106f32e7eSjoerg }
27206f32e7eSjoerg 
27306f32e7eSjoerg /// Performs element-by-element comparison of the two 128-bit vectors of
27406f32e7eSjoerg ///    [2 x double] and returns the vector containing the lesser of each pair of
27506f32e7eSjoerg ///    values.
27606f32e7eSjoerg ///
27706f32e7eSjoerg /// \headerfile <x86intrin.h>
27806f32e7eSjoerg ///
27906f32e7eSjoerg /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
28006f32e7eSjoerg ///
28106f32e7eSjoerg /// \param __a
28206f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands.
28306f32e7eSjoerg /// \param __b
28406f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands.
28506f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the minimum values
28606f32e7eSjoerg ///    between both operands.
28706f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a,__m128d __b)28806f32e7eSjoerg _mm_min_pd(__m128d __a, __m128d __b)
28906f32e7eSjoerg {
29006f32e7eSjoerg   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
29106f32e7eSjoerg }
29206f32e7eSjoerg 
29306f32e7eSjoerg /// Compares lower 64-bit double-precision values of both operands, and
29406f32e7eSjoerg ///    returns the greater of the pair of values in the lower 64-bits of the
29506f32e7eSjoerg ///    result. The upper 64 bits of the result are copied from the upper
29606f32e7eSjoerg ///    double-precision value of the first operand.
29706f32e7eSjoerg ///
29806f32e7eSjoerg /// \headerfile <x86intrin.h>
29906f32e7eSjoerg ///
30006f32e7eSjoerg /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
30106f32e7eSjoerg ///
30206f32e7eSjoerg /// \param __a
30306f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands. The
30406f32e7eSjoerg ///    lower 64 bits of this operand are used in the comparison.
30506f32e7eSjoerg /// \param __b
30606f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands. The
30706f32e7eSjoerg ///    lower 64 bits of this operand are used in the comparison.
30806f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
30906f32e7eSjoerg ///    maximum value between both operands. The upper 64 bits are copied from
31006f32e7eSjoerg ///    the upper 64 bits of the first source operand.
31106f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a,__m128d __b)31206f32e7eSjoerg _mm_max_sd(__m128d __a, __m128d __b)
31306f32e7eSjoerg {
31406f32e7eSjoerg   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
31506f32e7eSjoerg }
31606f32e7eSjoerg 
31706f32e7eSjoerg /// Performs element-by-element comparison of the two 128-bit vectors of
31806f32e7eSjoerg ///    [2 x double] and returns the vector containing the greater of each pair
31906f32e7eSjoerg ///    of values.
32006f32e7eSjoerg ///
32106f32e7eSjoerg /// \headerfile <x86intrin.h>
32206f32e7eSjoerg ///
32306f32e7eSjoerg /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
32406f32e7eSjoerg ///
32506f32e7eSjoerg /// \param __a
32606f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands.
32706f32e7eSjoerg /// \param __b
32806f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the operands.
32906f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the maximum values
33006f32e7eSjoerg ///    between both operands.
33106f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a,__m128d __b)33206f32e7eSjoerg _mm_max_pd(__m128d __a, __m128d __b)
33306f32e7eSjoerg {
33406f32e7eSjoerg   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
33506f32e7eSjoerg }
33606f32e7eSjoerg 
33706f32e7eSjoerg /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
33806f32e7eSjoerg ///
33906f32e7eSjoerg /// \headerfile <x86intrin.h>
34006f32e7eSjoerg ///
34106f32e7eSjoerg /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
34206f32e7eSjoerg ///
34306f32e7eSjoerg /// \param __a
34406f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
34506f32e7eSjoerg /// \param __b
34606f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
34706f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
34806f32e7eSjoerg ///    values between both operands.
34906f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a,__m128d __b)35006f32e7eSjoerg _mm_and_pd(__m128d __a, __m128d __b)
35106f32e7eSjoerg {
35206f32e7eSjoerg   return (__m128d)((__v2du)__a & (__v2du)__b);
35306f32e7eSjoerg }
35406f32e7eSjoerg 
35506f32e7eSjoerg /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
35606f32e7eSjoerg ///    the one's complement of the values contained in the first source operand.
35706f32e7eSjoerg ///
35806f32e7eSjoerg /// \headerfile <x86intrin.h>
35906f32e7eSjoerg ///
36006f32e7eSjoerg /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
36106f32e7eSjoerg ///
36206f32e7eSjoerg /// \param __a
36306f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the left source operand. The
36406f32e7eSjoerg ///    one's complement of this value is used in the bitwise AND.
36506f32e7eSjoerg /// \param __b
36606f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the right source operand.
36706f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
36806f32e7eSjoerg ///    values in the second operand and the one's complement of the first
36906f32e7eSjoerg ///    operand.
37006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a,__m128d __b)37106f32e7eSjoerg _mm_andnot_pd(__m128d __a, __m128d __b)
37206f32e7eSjoerg {
37306f32e7eSjoerg   return (__m128d)(~(__v2du)__a & (__v2du)__b);
37406f32e7eSjoerg }
37506f32e7eSjoerg 
37606f32e7eSjoerg /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
37706f32e7eSjoerg ///
37806f32e7eSjoerg /// \headerfile <x86intrin.h>
37906f32e7eSjoerg ///
38006f32e7eSjoerg /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
38106f32e7eSjoerg ///
38206f32e7eSjoerg /// \param __a
38306f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
38406f32e7eSjoerg /// \param __b
38506f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
38606f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
38706f32e7eSjoerg ///    values between both operands.
38806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a,__m128d __b)38906f32e7eSjoerg _mm_or_pd(__m128d __a, __m128d __b)
39006f32e7eSjoerg {
39106f32e7eSjoerg   return (__m128d)((__v2du)__a | (__v2du)__b);
39206f32e7eSjoerg }
39306f32e7eSjoerg 
39406f32e7eSjoerg /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
39506f32e7eSjoerg ///
39606f32e7eSjoerg /// \headerfile <x86intrin.h>
39706f32e7eSjoerg ///
39806f32e7eSjoerg /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
39906f32e7eSjoerg ///
40006f32e7eSjoerg /// \param __a
40106f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
40206f32e7eSjoerg /// \param __b
40306f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing one of the source operands.
40406f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
40506f32e7eSjoerg ///    values between both operands.
40606f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a,__m128d __b)40706f32e7eSjoerg _mm_xor_pd(__m128d __a, __m128d __b)
40806f32e7eSjoerg {
40906f32e7eSjoerg   return (__m128d)((__v2du)__a ^ (__v2du)__b);
41006f32e7eSjoerg }
41106f32e7eSjoerg 
41206f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
41306f32e7eSjoerg ///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
41406f32e7eSjoerg ///    for false, 0xFFFFFFFFFFFFFFFF for true.
41506f32e7eSjoerg ///
41606f32e7eSjoerg /// \headerfile <x86intrin.h>
41706f32e7eSjoerg ///
41806f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
41906f32e7eSjoerg ///
42006f32e7eSjoerg /// \param __a
42106f32e7eSjoerg ///    A 128-bit vector of [2 x double].
42206f32e7eSjoerg /// \param __b
42306f32e7eSjoerg ///    A 128-bit vector of [2 x double].
42406f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
42506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a,__m128d __b)42606f32e7eSjoerg _mm_cmpeq_pd(__m128d __a, __m128d __b)
42706f32e7eSjoerg {
42806f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
42906f32e7eSjoerg }
43006f32e7eSjoerg 
43106f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
43206f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
43306f32e7eSjoerg ///    operand are less than those in the second operand. Each comparison
43406f32e7eSjoerg ///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
43506f32e7eSjoerg ///
43606f32e7eSjoerg /// \headerfile <x86intrin.h>
43706f32e7eSjoerg ///
43806f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
43906f32e7eSjoerg ///
44006f32e7eSjoerg /// \param __a
44106f32e7eSjoerg ///    A 128-bit vector of [2 x double].
44206f32e7eSjoerg /// \param __b
44306f32e7eSjoerg ///    A 128-bit vector of [2 x double].
44406f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
44506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a,__m128d __b)44606f32e7eSjoerg _mm_cmplt_pd(__m128d __a, __m128d __b)
44706f32e7eSjoerg {
44806f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
44906f32e7eSjoerg }
45006f32e7eSjoerg 
45106f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
45206f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
45306f32e7eSjoerg ///    operand are less than or equal to those in the second operand.
45406f32e7eSjoerg ///
45506f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
45606f32e7eSjoerg ///
45706f32e7eSjoerg /// \headerfile <x86intrin.h>
45806f32e7eSjoerg ///
45906f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
46006f32e7eSjoerg ///
46106f32e7eSjoerg /// \param __a
46206f32e7eSjoerg ///    A 128-bit vector of [2 x double].
46306f32e7eSjoerg /// \param __b
46406f32e7eSjoerg ///    A 128-bit vector of [2 x double].
46506f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
46606f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a,__m128d __b)46706f32e7eSjoerg _mm_cmple_pd(__m128d __a, __m128d __b)
46806f32e7eSjoerg {
46906f32e7eSjoerg   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
47006f32e7eSjoerg }
47106f32e7eSjoerg 
47206f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
47306f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
47406f32e7eSjoerg ///    operand are greater than those in the second operand.
47506f32e7eSjoerg ///
47606f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
47706f32e7eSjoerg ///
47806f32e7eSjoerg /// \headerfile <x86intrin.h>
47906f32e7eSjoerg ///
48006f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
48106f32e7eSjoerg ///
48206f32e7eSjoerg /// \param __a
48306f32e7eSjoerg ///    A 128-bit vector of [2 x double].
48406f32e7eSjoerg /// \param __b
48506f32e7eSjoerg ///    A 128-bit vector of [2 x double].
48606f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
48706f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a,__m128d __b)48806f32e7eSjoerg _mm_cmpgt_pd(__m128d __a, __m128d __b)
48906f32e7eSjoerg {
49006f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
49106f32e7eSjoerg }
49206f32e7eSjoerg 
49306f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
49406f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
49506f32e7eSjoerg ///    operand are greater than or equal to those in the second operand.
49606f32e7eSjoerg ///
49706f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
49806f32e7eSjoerg ///
49906f32e7eSjoerg /// \headerfile <x86intrin.h>
50006f32e7eSjoerg ///
50106f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
50206f32e7eSjoerg ///
50306f32e7eSjoerg /// \param __a
50406f32e7eSjoerg ///    A 128-bit vector of [2 x double].
50506f32e7eSjoerg /// \param __b
50606f32e7eSjoerg ///    A 128-bit vector of [2 x double].
50706f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
50806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a,__m128d __b)50906f32e7eSjoerg _mm_cmpge_pd(__m128d __a, __m128d __b)
51006f32e7eSjoerg {
51106f32e7eSjoerg   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
51206f32e7eSjoerg }
51306f32e7eSjoerg 
51406f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
51506f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
51606f32e7eSjoerg ///    operand are ordered with respect to those in the second operand.
51706f32e7eSjoerg ///
51806f32e7eSjoerg ///    A pair of double-precision values are "ordered" with respect to each
51906f32e7eSjoerg ///    other if neither value is a NaN. Each comparison yields 0x0 for false,
52006f32e7eSjoerg ///    0xFFFFFFFFFFFFFFFF for true.
52106f32e7eSjoerg ///
52206f32e7eSjoerg /// \headerfile <x86intrin.h>
52306f32e7eSjoerg ///
52406f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
52506f32e7eSjoerg ///
52606f32e7eSjoerg /// \param __a
52706f32e7eSjoerg ///    A 128-bit vector of [2 x double].
52806f32e7eSjoerg /// \param __b
52906f32e7eSjoerg ///    A 128-bit vector of [2 x double].
53006f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
53106f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a,__m128d __b)53206f32e7eSjoerg _mm_cmpord_pd(__m128d __a, __m128d __b)
53306f32e7eSjoerg {
53406f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
53506f32e7eSjoerg }
53606f32e7eSjoerg 
53706f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
53806f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
53906f32e7eSjoerg ///    operand are unordered with respect to those in the second operand.
54006f32e7eSjoerg ///
54106f32e7eSjoerg ///    A pair of double-precision values are "unordered" with respect to each
54206f32e7eSjoerg ///    other if one or both values are NaN. Each comparison yields 0x0 for
54306f32e7eSjoerg ///    false, 0xFFFFFFFFFFFFFFFF for true.
54406f32e7eSjoerg ///
54506f32e7eSjoerg /// \headerfile <x86intrin.h>
54606f32e7eSjoerg ///
54706f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
54806f32e7eSjoerg ///   instruction.
54906f32e7eSjoerg ///
55006f32e7eSjoerg /// \param __a
55106f32e7eSjoerg ///    A 128-bit vector of [2 x double].
55206f32e7eSjoerg /// \param __b
55306f32e7eSjoerg ///    A 128-bit vector of [2 x double].
55406f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
55506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a,__m128d __b)55606f32e7eSjoerg _mm_cmpunord_pd(__m128d __a, __m128d __b)
55706f32e7eSjoerg {
55806f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
55906f32e7eSjoerg }
56006f32e7eSjoerg 
56106f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
56206f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
56306f32e7eSjoerg ///    operand are unequal to those in the second operand.
56406f32e7eSjoerg ///
56506f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
56606f32e7eSjoerg ///
56706f32e7eSjoerg /// \headerfile <x86intrin.h>
56806f32e7eSjoerg ///
56906f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
57006f32e7eSjoerg ///
57106f32e7eSjoerg /// \param __a
57206f32e7eSjoerg ///    A 128-bit vector of [2 x double].
57306f32e7eSjoerg /// \param __b
57406f32e7eSjoerg ///    A 128-bit vector of [2 x double].
57506f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
57606f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a,__m128d __b)57706f32e7eSjoerg _mm_cmpneq_pd(__m128d __a, __m128d __b)
57806f32e7eSjoerg {
57906f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
58006f32e7eSjoerg }
58106f32e7eSjoerg 
58206f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
58306f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
58406f32e7eSjoerg ///    operand are not less than those in the second operand.
58506f32e7eSjoerg ///
58606f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
58706f32e7eSjoerg ///
58806f32e7eSjoerg /// \headerfile <x86intrin.h>
58906f32e7eSjoerg ///
59006f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
59106f32e7eSjoerg ///
59206f32e7eSjoerg /// \param __a
59306f32e7eSjoerg ///    A 128-bit vector of [2 x double].
59406f32e7eSjoerg /// \param __b
59506f32e7eSjoerg ///    A 128-bit vector of [2 x double].
59606f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
59706f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a,__m128d __b)59806f32e7eSjoerg _mm_cmpnlt_pd(__m128d __a, __m128d __b)
59906f32e7eSjoerg {
60006f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
60106f32e7eSjoerg }
60206f32e7eSjoerg 
60306f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
60406f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
60506f32e7eSjoerg ///    operand are not less than or equal to those in the second operand.
60606f32e7eSjoerg ///
60706f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
60806f32e7eSjoerg ///
60906f32e7eSjoerg /// \headerfile <x86intrin.h>
61006f32e7eSjoerg ///
61106f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
61206f32e7eSjoerg ///
61306f32e7eSjoerg /// \param __a
61406f32e7eSjoerg ///    A 128-bit vector of [2 x double].
61506f32e7eSjoerg /// \param __b
61606f32e7eSjoerg ///    A 128-bit vector of [2 x double].
61706f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
61806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a,__m128d __b)61906f32e7eSjoerg _mm_cmpnle_pd(__m128d __a, __m128d __b)
62006f32e7eSjoerg {
62106f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
62206f32e7eSjoerg }
62306f32e7eSjoerg 
62406f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
62506f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
62606f32e7eSjoerg ///    operand are not greater than those in the second operand.
62706f32e7eSjoerg ///
62806f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
62906f32e7eSjoerg ///
63006f32e7eSjoerg /// \headerfile <x86intrin.h>
63106f32e7eSjoerg ///
63206f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
63306f32e7eSjoerg ///
63406f32e7eSjoerg /// \param __a
63506f32e7eSjoerg ///    A 128-bit vector of [2 x double].
63606f32e7eSjoerg /// \param __b
63706f32e7eSjoerg ///    A 128-bit vector of [2 x double].
63806f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
63906f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a,__m128d __b)64006f32e7eSjoerg _mm_cmpngt_pd(__m128d __a, __m128d __b)
64106f32e7eSjoerg {
64206f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
64306f32e7eSjoerg }
64406f32e7eSjoerg 
64506f32e7eSjoerg /// Compares each of the corresponding double-precision values of the
64606f32e7eSjoerg ///    128-bit vectors of [2 x double] to determine if the values in the first
64706f32e7eSjoerg ///    operand are not greater than or equal to those in the second operand.
64806f32e7eSjoerg ///
64906f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
65006f32e7eSjoerg ///
65106f32e7eSjoerg /// \headerfile <x86intrin.h>
65206f32e7eSjoerg ///
65306f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
65406f32e7eSjoerg ///
65506f32e7eSjoerg /// \param __a
65606f32e7eSjoerg ///    A 128-bit vector of [2 x double].
65706f32e7eSjoerg /// \param __b
65806f32e7eSjoerg ///    A 128-bit vector of [2 x double].
65906f32e7eSjoerg /// \returns A 128-bit vector containing the comparison results.
66006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a,__m128d __b)66106f32e7eSjoerg _mm_cmpnge_pd(__m128d __a, __m128d __b)
66206f32e7eSjoerg {
66306f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
66406f32e7eSjoerg }
66506f32e7eSjoerg 
66606f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
66706f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] for equality.
66806f32e7eSjoerg ///
66906f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
67006f32e7eSjoerg ///
67106f32e7eSjoerg /// \headerfile <x86intrin.h>
67206f32e7eSjoerg ///
67306f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
67406f32e7eSjoerg ///
67506f32e7eSjoerg /// \param __a
67606f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
67706f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
67806f32e7eSjoerg /// \param __b
67906f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
68006f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
68106f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
68206f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
68306f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a,__m128d __b)68406f32e7eSjoerg _mm_cmpeq_sd(__m128d __a, __m128d __b)
68506f32e7eSjoerg {
68606f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
68706f32e7eSjoerg }
68806f32e7eSjoerg 
68906f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
69006f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
69106f32e7eSjoerg ///    the value in the first parameter is less than the corresponding value in
69206f32e7eSjoerg ///    the second parameter.
69306f32e7eSjoerg ///
69406f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
69506f32e7eSjoerg ///
69606f32e7eSjoerg /// \headerfile <x86intrin.h>
69706f32e7eSjoerg ///
69806f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
69906f32e7eSjoerg ///
70006f32e7eSjoerg /// \param __a
70106f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
70206f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
70306f32e7eSjoerg /// \param __b
70406f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
70506f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
70606f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
70706f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
70806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a,__m128d __b)70906f32e7eSjoerg _mm_cmplt_sd(__m128d __a, __m128d __b)
71006f32e7eSjoerg {
71106f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
71206f32e7eSjoerg }
71306f32e7eSjoerg 
71406f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
71506f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
71606f32e7eSjoerg ///    the value in the first parameter is less than or equal to the
71706f32e7eSjoerg ///    corresponding value in the second parameter.
71806f32e7eSjoerg ///
71906f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
72006f32e7eSjoerg ///
72106f32e7eSjoerg /// \headerfile <x86intrin.h>
72206f32e7eSjoerg ///
72306f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
72406f32e7eSjoerg ///
72506f32e7eSjoerg /// \param __a
72606f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
72706f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
72806f32e7eSjoerg /// \param __b
72906f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
73006f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
73106f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
73206f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
73306f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a,__m128d __b)73406f32e7eSjoerg _mm_cmple_sd(__m128d __a, __m128d __b)
73506f32e7eSjoerg {
73606f32e7eSjoerg   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
73706f32e7eSjoerg }
73806f32e7eSjoerg 
73906f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
74006f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
74106f32e7eSjoerg ///    the value in the first parameter is greater than the corresponding value
74206f32e7eSjoerg ///    in the second parameter.
74306f32e7eSjoerg ///
74406f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
74506f32e7eSjoerg ///
74606f32e7eSjoerg /// \headerfile <x86intrin.h>
74706f32e7eSjoerg ///
74806f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
74906f32e7eSjoerg ///
75006f32e7eSjoerg /// \param __a
75106f32e7eSjoerg ///     A 128-bit vector of [2 x double]. The lower double-precision value is
75206f32e7eSjoerg ///     compared to the lower double-precision value of \a __b.
75306f32e7eSjoerg /// \param __b
75406f32e7eSjoerg ///     A 128-bit vector of [2 x double]. The lower double-precision value is
75506f32e7eSjoerg ///     compared to the lower double-precision value of \a __a.
75606f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
75706f32e7eSjoerg ///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
75806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a,__m128d __b)75906f32e7eSjoerg _mm_cmpgt_sd(__m128d __a, __m128d __b)
76006f32e7eSjoerg {
76106f32e7eSjoerg   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
76206f32e7eSjoerg   return __extension__ (__m128d) { __c[0], __a[1] };
76306f32e7eSjoerg }
76406f32e7eSjoerg 
76506f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
76606f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
76706f32e7eSjoerg ///    the value in the first parameter is greater than or equal to the
76806f32e7eSjoerg ///    corresponding value in the second parameter.
76906f32e7eSjoerg ///
77006f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
77106f32e7eSjoerg ///
77206f32e7eSjoerg /// \headerfile <x86intrin.h>
77306f32e7eSjoerg ///
77406f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
77506f32e7eSjoerg ///
77606f32e7eSjoerg /// \param __a
77706f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
77806f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
77906f32e7eSjoerg /// \param __b
78006f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
78106f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
78206f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
78306f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
78406f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a,__m128d __b)78506f32e7eSjoerg _mm_cmpge_sd(__m128d __a, __m128d __b)
78606f32e7eSjoerg {
78706f32e7eSjoerg   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
78806f32e7eSjoerg   return __extension__ (__m128d) { __c[0], __a[1] };
78906f32e7eSjoerg }
79006f32e7eSjoerg 
79106f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
79206f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
79306f32e7eSjoerg ///    the value in the first parameter is "ordered" with respect to the
79406f32e7eSjoerg ///    corresponding value in the second parameter.
79506f32e7eSjoerg ///
79606f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
79706f32e7eSjoerg ///    of double-precision values are "ordered" with respect to each other if
79806f32e7eSjoerg ///    neither value is a NaN.
79906f32e7eSjoerg ///
80006f32e7eSjoerg /// \headerfile <x86intrin.h>
80106f32e7eSjoerg ///
80206f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
80306f32e7eSjoerg ///
80406f32e7eSjoerg /// \param __a
80506f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
80606f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
80706f32e7eSjoerg /// \param __b
80806f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
80906f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
81006f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
81106f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
81206f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a,__m128d __b)81306f32e7eSjoerg _mm_cmpord_sd(__m128d __a, __m128d __b)
81406f32e7eSjoerg {
81506f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
81606f32e7eSjoerg }
81706f32e7eSjoerg 
81806f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
81906f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
82006f32e7eSjoerg ///    the value in the first parameter is "unordered" with respect to the
82106f32e7eSjoerg ///    corresponding value in the second parameter.
82206f32e7eSjoerg ///
82306f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
82406f32e7eSjoerg ///    of double-precision values are "unordered" with respect to each other if
82506f32e7eSjoerg ///    one or both values are NaN.
82606f32e7eSjoerg ///
82706f32e7eSjoerg /// \headerfile <x86intrin.h>
82806f32e7eSjoerg ///
82906f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
83006f32e7eSjoerg ///   instruction.
83106f32e7eSjoerg ///
83206f32e7eSjoerg /// \param __a
83306f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
83406f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
83506f32e7eSjoerg /// \param __b
83606f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
83706f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
83806f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
83906f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
84006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a,__m128d __b)84106f32e7eSjoerg _mm_cmpunord_sd(__m128d __a, __m128d __b)
84206f32e7eSjoerg {
84306f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
84406f32e7eSjoerg }
84506f32e7eSjoerg 
84606f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
84706f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
84806f32e7eSjoerg ///    the value in the first parameter is unequal to the corresponding value in
84906f32e7eSjoerg ///    the second parameter.
85006f32e7eSjoerg ///
85106f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
85206f32e7eSjoerg ///
85306f32e7eSjoerg /// \headerfile <x86intrin.h>
85406f32e7eSjoerg ///
85506f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
85606f32e7eSjoerg ///
85706f32e7eSjoerg /// \param __a
85806f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
85906f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
86006f32e7eSjoerg /// \param __b
86106f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
86206f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
86306f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
86406f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
86506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a,__m128d __b)86606f32e7eSjoerg _mm_cmpneq_sd(__m128d __a, __m128d __b)
86706f32e7eSjoerg {
86806f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
86906f32e7eSjoerg }
87006f32e7eSjoerg 
87106f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
87206f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
87306f32e7eSjoerg ///    the value in the first parameter is not less than the corresponding
87406f32e7eSjoerg ///    value in the second parameter.
87506f32e7eSjoerg ///
87606f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
87706f32e7eSjoerg ///
87806f32e7eSjoerg /// \headerfile <x86intrin.h>
87906f32e7eSjoerg ///
88006f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
88106f32e7eSjoerg ///
88206f32e7eSjoerg /// \param __a
88306f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
88406f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
88506f32e7eSjoerg /// \param __b
88606f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
88706f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
88806f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
88906f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
89006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a,__m128d __b)89106f32e7eSjoerg _mm_cmpnlt_sd(__m128d __a, __m128d __b)
89206f32e7eSjoerg {
89306f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
89406f32e7eSjoerg }
89506f32e7eSjoerg 
89606f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
89706f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
89806f32e7eSjoerg ///    the value in the first parameter is not less than or equal to the
89906f32e7eSjoerg ///    corresponding value in the second parameter.
90006f32e7eSjoerg ///
90106f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
90206f32e7eSjoerg ///
90306f32e7eSjoerg /// \headerfile <x86intrin.h>
90406f32e7eSjoerg ///
90506f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
90606f32e7eSjoerg ///
90706f32e7eSjoerg /// \param __a
90806f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
90906f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
91006f32e7eSjoerg /// \param __b
91106f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
91206f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
91306f32e7eSjoerg /// \returns  A 128-bit vector. The lower 64 bits contains the comparison
91406f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
91506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a,__m128d __b)91606f32e7eSjoerg _mm_cmpnle_sd(__m128d __a, __m128d __b)
91706f32e7eSjoerg {
91806f32e7eSjoerg   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
91906f32e7eSjoerg }
92006f32e7eSjoerg 
92106f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
92206f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
92306f32e7eSjoerg ///    the value in the first parameter is not greater than the corresponding
92406f32e7eSjoerg ///    value in the second parameter.
92506f32e7eSjoerg ///
92606f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
92706f32e7eSjoerg ///
92806f32e7eSjoerg /// \headerfile <x86intrin.h>
92906f32e7eSjoerg ///
93006f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
93106f32e7eSjoerg ///
93206f32e7eSjoerg /// \param __a
93306f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
93406f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
93506f32e7eSjoerg /// \param __b
93606f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
93706f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
93806f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
93906f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
94006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a,__m128d __b)94106f32e7eSjoerg _mm_cmpngt_sd(__m128d __a, __m128d __b)
94206f32e7eSjoerg {
94306f32e7eSjoerg   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
94406f32e7eSjoerg   return __extension__ (__m128d) { __c[0], __a[1] };
94506f32e7eSjoerg }
94606f32e7eSjoerg 
94706f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
94806f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
94906f32e7eSjoerg ///    the value in the first parameter is not greater than or equal to the
95006f32e7eSjoerg ///    corresponding value in the second parameter.
95106f32e7eSjoerg ///
95206f32e7eSjoerg ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
95306f32e7eSjoerg ///
95406f32e7eSjoerg /// \headerfile <x86intrin.h>
95506f32e7eSjoerg ///
95606f32e7eSjoerg /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
95706f32e7eSjoerg ///
95806f32e7eSjoerg /// \param __a
95906f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
96006f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
96106f32e7eSjoerg /// \param __b
96206f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
96306f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
96406f32e7eSjoerg /// \returns A 128-bit vector. The lower 64 bits contains the comparison
96506f32e7eSjoerg ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
96606f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a,__m128d __b)96706f32e7eSjoerg _mm_cmpnge_sd(__m128d __a, __m128d __b)
96806f32e7eSjoerg {
96906f32e7eSjoerg   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
97006f32e7eSjoerg   return __extension__ (__m128d) { __c[0], __a[1] };
97106f32e7eSjoerg }
97206f32e7eSjoerg 
97306f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
97406f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] for equality.
97506f32e7eSjoerg ///
97606f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two
97706f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
97806f32e7eSjoerg ///
97906f32e7eSjoerg /// \headerfile <x86intrin.h>
98006f32e7eSjoerg ///
98106f32e7eSjoerg /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
98206f32e7eSjoerg ///
98306f32e7eSjoerg /// \param __a
98406f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
98506f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
98606f32e7eSjoerg /// \param __b
98706f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
98806f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
98906f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
99006f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
99106f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a,__m128d __b)99206f32e7eSjoerg _mm_comieq_sd(__m128d __a, __m128d __b)
99306f32e7eSjoerg {
99406f32e7eSjoerg   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
99506f32e7eSjoerg }
99606f32e7eSjoerg 
99706f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
99806f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
99906f32e7eSjoerg ///    the value in the first parameter is less than the corresponding value in
100006f32e7eSjoerg ///    the second parameter.
100106f32e7eSjoerg ///
100206f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two
100306f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
100406f32e7eSjoerg ///
100506f32e7eSjoerg /// \headerfile <x86intrin.h>
100606f32e7eSjoerg ///
100706f32e7eSjoerg /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
100806f32e7eSjoerg ///
100906f32e7eSjoerg /// \param __a
101006f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
101106f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
101206f32e7eSjoerg /// \param __b
101306f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
101406f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
101506f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
101606f32e7eSjoerg ///     lower double-precision values is NaN, 0 is returned.
101706f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a,__m128d __b)101806f32e7eSjoerg _mm_comilt_sd(__m128d __a, __m128d __b)
101906f32e7eSjoerg {
102006f32e7eSjoerg   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
102106f32e7eSjoerg }
102206f32e7eSjoerg 
102306f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
102406f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
102506f32e7eSjoerg ///    the value in the first parameter is less than or equal to the
102606f32e7eSjoerg ///    corresponding value in the second parameter.
102706f32e7eSjoerg ///
102806f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two
102906f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
103006f32e7eSjoerg ///
103106f32e7eSjoerg /// \headerfile <x86intrin.h>
103206f32e7eSjoerg ///
103306f32e7eSjoerg /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
103406f32e7eSjoerg ///
103506f32e7eSjoerg /// \param __a
103606f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
103706f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
103806f32e7eSjoerg /// \param __b
103906f32e7eSjoerg ///     A 128-bit vector of [2 x double]. The lower double-precision value is
104006f32e7eSjoerg ///     compared to the lower double-precision value of \a __a.
104106f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
104206f32e7eSjoerg ///     lower double-precision values is NaN, 0 is returned.
104306f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a,__m128d __b)104406f32e7eSjoerg _mm_comile_sd(__m128d __a, __m128d __b)
104506f32e7eSjoerg {
104606f32e7eSjoerg   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
104706f32e7eSjoerg }
104806f32e7eSjoerg 
104906f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
105006f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
105106f32e7eSjoerg ///    the value in the first parameter is greater than the corresponding value
105206f32e7eSjoerg ///    in the second parameter.
105306f32e7eSjoerg ///
105406f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two
105506f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
105606f32e7eSjoerg ///
105706f32e7eSjoerg /// \headerfile <x86intrin.h>
105806f32e7eSjoerg ///
105906f32e7eSjoerg /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
106006f32e7eSjoerg ///
106106f32e7eSjoerg /// \param __a
106206f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
106306f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
106406f32e7eSjoerg /// \param __b
106506f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
106606f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
106706f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
106806f32e7eSjoerg ///     lower double-precision values is NaN, 0 is returned.
106906f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a,__m128d __b)107006f32e7eSjoerg _mm_comigt_sd(__m128d __a, __m128d __b)
107106f32e7eSjoerg {
107206f32e7eSjoerg   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
107306f32e7eSjoerg }
107406f32e7eSjoerg 
107506f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
107606f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
107706f32e7eSjoerg ///    the value in the first parameter is greater than or equal to the
107806f32e7eSjoerg ///    corresponding value in the second parameter.
107906f32e7eSjoerg ///
108006f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two
108106f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
108206f32e7eSjoerg ///
108306f32e7eSjoerg /// \headerfile <x86intrin.h>
108406f32e7eSjoerg ///
108506f32e7eSjoerg /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
108606f32e7eSjoerg ///
108706f32e7eSjoerg /// \param __a
108806f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
108906f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
109006f32e7eSjoerg /// \param __b
109106f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
109206f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
109306f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
109406f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
109506f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a,__m128d __b)109606f32e7eSjoerg _mm_comige_sd(__m128d __a, __m128d __b)
109706f32e7eSjoerg {
109806f32e7eSjoerg   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
109906f32e7eSjoerg }
110006f32e7eSjoerg 
110106f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
110206f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
110306f32e7eSjoerg ///    the value in the first parameter is unequal to the corresponding value in
110406f32e7eSjoerg ///    the second parameter.
110506f32e7eSjoerg ///
110606f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two
110706f32e7eSjoerg ///    lower double-precision values is NaN, 1 is returned.
110806f32e7eSjoerg ///
110906f32e7eSjoerg /// \headerfile <x86intrin.h>
111006f32e7eSjoerg ///
111106f32e7eSjoerg /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
111206f32e7eSjoerg ///
111306f32e7eSjoerg /// \param __a
111406f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
111506f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
111606f32e7eSjoerg /// \param __b
111706f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
111806f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
111906f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
112006f32e7eSjoerg ///     lower double-precision values is NaN, 1 is returned.
112106f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a,__m128d __b)112206f32e7eSjoerg _mm_comineq_sd(__m128d __a, __m128d __b)
112306f32e7eSjoerg {
112406f32e7eSjoerg   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
112506f32e7eSjoerg }
112606f32e7eSjoerg 
112706f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
112806f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] for equality. The
112906f32e7eSjoerg ///    comparison yields 0 for false, 1 for true.
113006f32e7eSjoerg ///
113106f32e7eSjoerg ///    If either of the two lower double-precision values is NaN, 0 is returned.
113206f32e7eSjoerg ///
113306f32e7eSjoerg /// \headerfile <x86intrin.h>
113406f32e7eSjoerg ///
113506f32e7eSjoerg /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
113606f32e7eSjoerg ///
113706f32e7eSjoerg /// \param __a
113806f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
113906f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
114006f32e7eSjoerg /// \param __b
114106f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
114206f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
114306f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
114406f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
114506f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a,__m128d __b)114606f32e7eSjoerg _mm_ucomieq_sd(__m128d __a, __m128d __b)
114706f32e7eSjoerg {
114806f32e7eSjoerg   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
114906f32e7eSjoerg }
115006f32e7eSjoerg 
115106f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
115206f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
115306f32e7eSjoerg ///    the value in the first parameter is less than the corresponding value in
115406f32e7eSjoerg ///    the second parameter.
115506f32e7eSjoerg ///
115606f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two lower
115706f32e7eSjoerg ///    double-precision values is NaN, 0 is returned.
115806f32e7eSjoerg ///
115906f32e7eSjoerg /// \headerfile <x86intrin.h>
116006f32e7eSjoerg ///
116106f32e7eSjoerg /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
116206f32e7eSjoerg ///
116306f32e7eSjoerg /// \param __a
116406f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
116506f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
116606f32e7eSjoerg /// \param __b
116706f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
116806f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
116906f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
117006f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
117106f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a,__m128d __b)117206f32e7eSjoerg _mm_ucomilt_sd(__m128d __a, __m128d __b)
117306f32e7eSjoerg {
117406f32e7eSjoerg   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
117506f32e7eSjoerg }
117606f32e7eSjoerg 
117706f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
117806f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
117906f32e7eSjoerg ///    the value in the first parameter is less than or equal to the
118006f32e7eSjoerg ///    corresponding value in the second parameter.
118106f32e7eSjoerg ///
118206f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two lower
118306f32e7eSjoerg ///    double-precision values is NaN, 0 is returned.
118406f32e7eSjoerg ///
118506f32e7eSjoerg /// \headerfile <x86intrin.h>
118606f32e7eSjoerg ///
118706f32e7eSjoerg /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
118806f32e7eSjoerg ///
118906f32e7eSjoerg /// \param __a
119006f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
119106f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
119206f32e7eSjoerg /// \param __b
119306f32e7eSjoerg ///     A 128-bit vector of [2 x double]. The lower double-precision value is
119406f32e7eSjoerg ///     compared to the lower double-precision value of \a __a.
119506f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
119606f32e7eSjoerg ///     lower double-precision values is NaN, 0 is returned.
119706f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a,__m128d __b)119806f32e7eSjoerg _mm_ucomile_sd(__m128d __a, __m128d __b)
119906f32e7eSjoerg {
120006f32e7eSjoerg   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
120106f32e7eSjoerg }
120206f32e7eSjoerg 
120306f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
120406f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
120506f32e7eSjoerg ///    the value in the first parameter is greater than the corresponding value
120606f32e7eSjoerg ///    in the second parameter.
120706f32e7eSjoerg ///
120806f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two lower
120906f32e7eSjoerg ///    double-precision values is NaN, 0 is returned.
121006f32e7eSjoerg ///
121106f32e7eSjoerg /// \headerfile <x86intrin.h>
121206f32e7eSjoerg ///
121306f32e7eSjoerg /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
121406f32e7eSjoerg ///
121506f32e7eSjoerg /// \param __a
121606f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
121706f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
121806f32e7eSjoerg /// \param __b
121906f32e7eSjoerg ///     A 128-bit vector of [2 x double]. The lower double-precision value is
122006f32e7eSjoerg ///     compared to the lower double-precision value of \a __a.
122106f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
122206f32e7eSjoerg ///     lower double-precision values is NaN, 0 is returned.
122306f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_sd(__m128d __a,__m128d __b)122406f32e7eSjoerg _mm_ucomigt_sd(__m128d __a, __m128d __b)
122506f32e7eSjoerg {
122606f32e7eSjoerg   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
122706f32e7eSjoerg }
122806f32e7eSjoerg 
122906f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
123006f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
123106f32e7eSjoerg ///    the value in the first parameter is greater than or equal to the
123206f32e7eSjoerg ///    corresponding value in the second parameter.
123306f32e7eSjoerg ///
123406f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true.  If either of the two
123506f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
123606f32e7eSjoerg ///
123706f32e7eSjoerg /// \headerfile <x86intrin.h>
123806f32e7eSjoerg ///
123906f32e7eSjoerg /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
124006f32e7eSjoerg ///
124106f32e7eSjoerg /// \param __a
124206f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
124306f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
124406f32e7eSjoerg /// \param __b
124506f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
124606f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
124706f32e7eSjoerg /// \returns An integer containing the comparison results. If either of the two
124806f32e7eSjoerg ///    lower double-precision values is NaN, 0 is returned.
124906f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_sd(__m128d __a,__m128d __b)125006f32e7eSjoerg _mm_ucomige_sd(__m128d __a, __m128d __b)
125106f32e7eSjoerg {
125206f32e7eSjoerg   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
125306f32e7eSjoerg }
125406f32e7eSjoerg 
125506f32e7eSjoerg /// Compares the lower double-precision floating-point values in each of
125606f32e7eSjoerg ///    the two 128-bit floating-point vectors of [2 x double] to determine if
125706f32e7eSjoerg ///    the value in the first parameter is unequal to the corresponding value in
125806f32e7eSjoerg ///    the second parameter.
125906f32e7eSjoerg ///
126006f32e7eSjoerg ///    The comparison yields 0 for false, 1 for true. If either of the two lower
126106f32e7eSjoerg ///    double-precision values is NaN, 1 is returned.
126206f32e7eSjoerg ///
126306f32e7eSjoerg /// \headerfile <x86intrin.h>
126406f32e7eSjoerg ///
126506f32e7eSjoerg /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
126606f32e7eSjoerg ///
126706f32e7eSjoerg /// \param __a
126806f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
126906f32e7eSjoerg ///    compared to the lower double-precision value of \a __b.
127006f32e7eSjoerg /// \param __b
127106f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision value is
127206f32e7eSjoerg ///    compared to the lower double-precision value of \a __a.
127306f32e7eSjoerg /// \returns An integer containing the comparison result. If either of the two
127406f32e7eSjoerg ///    lower double-precision values is NaN, 1 is returned.
127506f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a,__m128d __b)127606f32e7eSjoerg _mm_ucomineq_sd(__m128d __a, __m128d __b)
127706f32e7eSjoerg {
127806f32e7eSjoerg   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
127906f32e7eSjoerg }
128006f32e7eSjoerg 
128106f32e7eSjoerg /// Converts the two double-precision floating-point elements of a
128206f32e7eSjoerg ///    128-bit vector of [2 x double] into two single-precision floating-point
128306f32e7eSjoerg ///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
128406f32e7eSjoerg ///    The upper 64 bits of the result vector are set to zero.
128506f32e7eSjoerg ///
128606f32e7eSjoerg /// \headerfile <x86intrin.h>
128706f32e7eSjoerg ///
128806f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
128906f32e7eSjoerg ///
129006f32e7eSjoerg /// \param __a
129106f32e7eSjoerg ///    A 128-bit vector of [2 x double].
129206f32e7eSjoerg /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
129306f32e7eSjoerg ///    converted values. The upper 64 bits are set to zero.
129406f32e7eSjoerg static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtpd_ps(__m128d __a)129506f32e7eSjoerg _mm_cvtpd_ps(__m128d __a)
129606f32e7eSjoerg {
129706f32e7eSjoerg   return __builtin_ia32_cvtpd2ps((__v2df)__a);
129806f32e7eSjoerg }
129906f32e7eSjoerg 
130006f32e7eSjoerg /// Converts the lower two single-precision floating-point elements of a
130106f32e7eSjoerg ///    128-bit vector of [4 x float] into two double-precision floating-point
130206f32e7eSjoerg ///    values, returned in a 128-bit vector of [2 x double]. The upper two
130306f32e7eSjoerg ///    elements of the input vector are unused.
130406f32e7eSjoerg ///
130506f32e7eSjoerg /// \headerfile <x86intrin.h>
130606f32e7eSjoerg ///
130706f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
130806f32e7eSjoerg ///
130906f32e7eSjoerg /// \param __a
131006f32e7eSjoerg ///    A 128-bit vector of [4 x float]. The lower two single-precision
131106f32e7eSjoerg ///    floating-point elements are converted to double-precision values. The
131206f32e7eSjoerg ///    upper two elements are unused.
131306f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the converted values.
131406f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtps_pd(__m128 __a)131506f32e7eSjoerg _mm_cvtps_pd(__m128 __a)
131606f32e7eSjoerg {
131706f32e7eSjoerg   return (__m128d) __builtin_convertvector(
131806f32e7eSjoerg       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
131906f32e7eSjoerg }
132006f32e7eSjoerg 
132106f32e7eSjoerg /// Converts the lower two integer elements of a 128-bit vector of
132206f32e7eSjoerg ///    [4 x i32] into two double-precision floating-point values, returned in a
132306f32e7eSjoerg ///    128-bit vector of [2 x double].
132406f32e7eSjoerg ///
132506f32e7eSjoerg ///    The upper two elements of the input vector are unused.
132606f32e7eSjoerg ///
132706f32e7eSjoerg /// \headerfile <x86intrin.h>
132806f32e7eSjoerg ///
132906f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
133006f32e7eSjoerg ///
133106f32e7eSjoerg /// \param __a
133206f32e7eSjoerg ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
133306f32e7eSjoerg ///    converted to double-precision values.
133406f32e7eSjoerg ///
133506f32e7eSjoerg ///    The upper two elements are unused.
133606f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the converted values.
133706f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a)133806f32e7eSjoerg _mm_cvtepi32_pd(__m128i __a)
133906f32e7eSjoerg {
134006f32e7eSjoerg   return (__m128d) __builtin_convertvector(
134106f32e7eSjoerg       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
134206f32e7eSjoerg }
134306f32e7eSjoerg 
134406f32e7eSjoerg /// Converts the two double-precision floating-point elements of a
134506f32e7eSjoerg ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
134606f32e7eSjoerg ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
134706f32e7eSjoerg ///    64 bits of the result vector are set to zero.
134806f32e7eSjoerg ///
134906f32e7eSjoerg /// \headerfile <x86intrin.h>
135006f32e7eSjoerg ///
135106f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
135206f32e7eSjoerg ///
135306f32e7eSjoerg /// \param __a
135406f32e7eSjoerg ///    A 128-bit vector of [2 x double].
135506f32e7eSjoerg /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
135606f32e7eSjoerg ///    converted values. The upper 64 bits are set to zero.
135706f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi32(__m128d __a)135806f32e7eSjoerg _mm_cvtpd_epi32(__m128d __a)
135906f32e7eSjoerg {
136006f32e7eSjoerg   return __builtin_ia32_cvtpd2dq((__v2df)__a);
136106f32e7eSjoerg }
136206f32e7eSjoerg 
136306f32e7eSjoerg /// Converts the low-order element of a 128-bit vector of [2 x double]
136406f32e7eSjoerg ///    into a 32-bit signed integer value.
136506f32e7eSjoerg ///
136606f32e7eSjoerg /// \headerfile <x86intrin.h>
136706f32e7eSjoerg ///
136806f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
136906f32e7eSjoerg ///
137006f32e7eSjoerg /// \param __a
137106f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
137206f32e7eSjoerg ///    conversion.
137306f32e7eSjoerg /// \returns A 32-bit signed integer containing the converted value.
137406f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsd_si32(__m128d __a)137506f32e7eSjoerg _mm_cvtsd_si32(__m128d __a)
137606f32e7eSjoerg {
137706f32e7eSjoerg   return __builtin_ia32_cvtsd2si((__v2df)__a);
137806f32e7eSjoerg }
137906f32e7eSjoerg 
138006f32e7eSjoerg /// Converts the lower double-precision floating-point element of a
138106f32e7eSjoerg ///    128-bit vector of [2 x double], in the second parameter, into a
138206f32e7eSjoerg ///    single-precision floating-point value, returned in the lower 32 bits of a
138306f32e7eSjoerg ///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
138406f32e7eSjoerg ///    copied from the upper 96 bits of the first parameter.
138506f32e7eSjoerg ///
138606f32e7eSjoerg /// \headerfile <x86intrin.h>
138706f32e7eSjoerg ///
138806f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
138906f32e7eSjoerg ///
139006f32e7eSjoerg /// \param __a
139106f32e7eSjoerg ///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
139206f32e7eSjoerg ///    copied to the upper 96 bits of the result.
139306f32e7eSjoerg /// \param __b
139406f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower double-precision
139506f32e7eSjoerg ///    floating-point element is used in the conversion.
139606f32e7eSjoerg /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
139706f32e7eSjoerg ///    converted value from the second parameter. The upper 96 bits are copied
139806f32e7eSjoerg ///    from the upper 96 bits of the first parameter.
139906f32e7eSjoerg static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsd_ss(__m128 __a,__m128d __b)140006f32e7eSjoerg _mm_cvtsd_ss(__m128 __a, __m128d __b)
140106f32e7eSjoerg {
140206f32e7eSjoerg   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
140306f32e7eSjoerg }
140406f32e7eSjoerg 
140506f32e7eSjoerg /// Converts a 32-bit signed integer value, in the second parameter, into
140606f32e7eSjoerg ///    a double-precision floating-point value, returned in the lower 64 bits of
140706f32e7eSjoerg ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
140806f32e7eSjoerg ///    are copied from the upper 64 bits of the first parameter.
140906f32e7eSjoerg ///
141006f32e7eSjoerg /// \headerfile <x86intrin.h>
141106f32e7eSjoerg ///
141206f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
141306f32e7eSjoerg ///
141406f32e7eSjoerg /// \param __a
141506f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
141606f32e7eSjoerg ///    copied to the upper 64 bits of the result.
141706f32e7eSjoerg /// \param __b
141806f32e7eSjoerg ///    A 32-bit signed integer containing the value to be converted.
141906f32e7eSjoerg /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
142006f32e7eSjoerg ///    converted value from the second parameter. The upper 64 bits are copied
142106f32e7eSjoerg ///    from the upper 64 bits of the first parameter.
142206f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi32_sd(__m128d __a,int __b)142306f32e7eSjoerg _mm_cvtsi32_sd(__m128d __a, int __b)
142406f32e7eSjoerg {
142506f32e7eSjoerg   __a[0] = __b;
142606f32e7eSjoerg   return __a;
142706f32e7eSjoerg }
142806f32e7eSjoerg 
142906f32e7eSjoerg /// Converts the lower single-precision floating-point element of a
143006f32e7eSjoerg ///    128-bit vector of [4 x float], in the second parameter, into a
143106f32e7eSjoerg ///    double-precision floating-point value, returned in the lower 64 bits of
143206f32e7eSjoerg ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
143306f32e7eSjoerg ///    are copied from the upper 64 bits of the first parameter.
143406f32e7eSjoerg ///
143506f32e7eSjoerg /// \headerfile <x86intrin.h>
143606f32e7eSjoerg ///
143706f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
143806f32e7eSjoerg ///
143906f32e7eSjoerg /// \param __a
144006f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
144106f32e7eSjoerg ///    copied to the upper 64 bits of the result.
144206f32e7eSjoerg /// \param __b
144306f32e7eSjoerg ///    A 128-bit vector of [4 x float]. The lower single-precision
144406f32e7eSjoerg ///    floating-point element is used in the conversion.
144506f32e7eSjoerg /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
144606f32e7eSjoerg ///    converted value from the second parameter. The upper 64 bits are copied
144706f32e7eSjoerg ///    from the upper 64 bits of the first parameter.
144806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtss_sd(__m128d __a,__m128 __b)144906f32e7eSjoerg _mm_cvtss_sd(__m128d __a, __m128 __b)
145006f32e7eSjoerg {
145106f32e7eSjoerg   __a[0] = __b[0];
145206f32e7eSjoerg   return __a;
145306f32e7eSjoerg }
145406f32e7eSjoerg 
145506f32e7eSjoerg /// Converts the two double-precision floating-point elements of a
145606f32e7eSjoerg ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
145706f32e7eSjoerg ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
145806f32e7eSjoerg ///
145906f32e7eSjoerg ///    If the result of either conversion is inexact, the result is truncated
146006f32e7eSjoerg ///    (rounded towards zero) regardless of the current MXCSR setting. The upper
146106f32e7eSjoerg ///    64 bits of the result vector are set to zero.
146206f32e7eSjoerg ///
146306f32e7eSjoerg /// \headerfile <x86intrin.h>
146406f32e7eSjoerg ///
146506f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
146606f32e7eSjoerg ///   instruction.
146706f32e7eSjoerg ///
146806f32e7eSjoerg /// \param __a
146906f32e7eSjoerg ///    A 128-bit vector of [2 x double].
147006f32e7eSjoerg /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
147106f32e7eSjoerg ///    converted values. The upper 64 bits are set to zero.
147206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epi32(__m128d __a)147306f32e7eSjoerg _mm_cvttpd_epi32(__m128d __a)
147406f32e7eSjoerg {
147506f32e7eSjoerg   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
147606f32e7eSjoerg }
147706f32e7eSjoerg 
147806f32e7eSjoerg /// Converts the low-order element of a [2 x double] vector into a 32-bit
147906f32e7eSjoerg ///    signed integer value, truncating the result when it is inexact.
148006f32e7eSjoerg ///
148106f32e7eSjoerg /// \headerfile <x86intrin.h>
148206f32e7eSjoerg ///
148306f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
148406f32e7eSjoerg ///   instruction.
148506f32e7eSjoerg ///
148606f32e7eSjoerg /// \param __a
148706f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
148806f32e7eSjoerg ///    conversion.
148906f32e7eSjoerg /// \returns A 32-bit signed integer containing the converted value.
149006f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttsd_si32(__m128d __a)149106f32e7eSjoerg _mm_cvttsd_si32(__m128d __a)
149206f32e7eSjoerg {
149306f32e7eSjoerg   return __builtin_ia32_cvttsd2si((__v2df)__a);
149406f32e7eSjoerg }
149506f32e7eSjoerg 
149606f32e7eSjoerg /// Converts the two double-precision floating-point elements of a
149706f32e7eSjoerg ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
149806f32e7eSjoerg ///    returned in a 64-bit vector of [2 x i32].
149906f32e7eSjoerg ///
150006f32e7eSjoerg /// \headerfile <x86intrin.h>
150106f32e7eSjoerg ///
150206f32e7eSjoerg /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
150306f32e7eSjoerg ///
150406f32e7eSjoerg /// \param __a
150506f32e7eSjoerg ///    A 128-bit vector of [2 x double].
150606f32e7eSjoerg /// \returns A 64-bit vector of [2 x i32] containing the converted values.
150706f32e7eSjoerg static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpd_pi32(__m128d __a)150806f32e7eSjoerg _mm_cvtpd_pi32(__m128d __a)
150906f32e7eSjoerg {
151006f32e7eSjoerg   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
151106f32e7eSjoerg }
151206f32e7eSjoerg 
151306f32e7eSjoerg /// Converts the two double-precision floating-point elements of a
151406f32e7eSjoerg ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
151506f32e7eSjoerg ///    returned in a 64-bit vector of [2 x i32].
151606f32e7eSjoerg ///
151706f32e7eSjoerg ///    If the result of either conversion is inexact, the result is truncated
151806f32e7eSjoerg ///    (rounded towards zero) regardless of the current MXCSR setting.
151906f32e7eSjoerg ///
152006f32e7eSjoerg /// \headerfile <x86intrin.h>
152106f32e7eSjoerg ///
152206f32e7eSjoerg /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
152306f32e7eSjoerg ///
152406f32e7eSjoerg /// \param __a
152506f32e7eSjoerg ///    A 128-bit vector of [2 x double].
152606f32e7eSjoerg /// \returns A 64-bit vector of [2 x i32] containing the converted values.
152706f32e7eSjoerg static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvttpd_pi32(__m128d __a)152806f32e7eSjoerg _mm_cvttpd_pi32(__m128d __a)
152906f32e7eSjoerg {
153006f32e7eSjoerg   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
153106f32e7eSjoerg }
153206f32e7eSjoerg 
153306f32e7eSjoerg /// Converts the two signed 32-bit integer elements of a 64-bit vector of
153406f32e7eSjoerg ///    [2 x i32] into two double-precision floating-point values, returned in a
153506f32e7eSjoerg ///    128-bit vector of [2 x double].
153606f32e7eSjoerg ///
153706f32e7eSjoerg /// \headerfile <x86intrin.h>
153806f32e7eSjoerg ///
153906f32e7eSjoerg /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
154006f32e7eSjoerg ///
154106f32e7eSjoerg /// \param __a
154206f32e7eSjoerg ///    A 64-bit vector of [2 x i32].
154306f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the converted values.
154406f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32_pd(__m64 __a)154506f32e7eSjoerg _mm_cvtpi32_pd(__m64 __a)
154606f32e7eSjoerg {
154706f32e7eSjoerg   return __builtin_ia32_cvtpi2pd((__v2si)__a);
154806f32e7eSjoerg }
154906f32e7eSjoerg 
155006f32e7eSjoerg /// Returns the low-order element of a 128-bit vector of [2 x double] as
155106f32e7eSjoerg ///    a double-precision floating-point value.
155206f32e7eSjoerg ///
155306f32e7eSjoerg /// \headerfile <x86intrin.h>
155406f32e7eSjoerg ///
155506f32e7eSjoerg /// This intrinsic has no corresponding instruction.
155606f32e7eSjoerg ///
155706f32e7eSjoerg /// \param __a
155806f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
155906f32e7eSjoerg /// \returns A double-precision floating-point value copied from the lower 64
156006f32e7eSjoerg ///    bits of \a __a.
156106f32e7eSjoerg static __inline__ double __DEFAULT_FN_ATTRS
_mm_cvtsd_f64(__m128d __a)156206f32e7eSjoerg _mm_cvtsd_f64(__m128d __a)
156306f32e7eSjoerg {
156406f32e7eSjoerg   return __a[0];
156506f32e7eSjoerg }
156606f32e7eSjoerg 
156706f32e7eSjoerg /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
156806f32e7eSjoerg ///    memory location.
156906f32e7eSjoerg ///
157006f32e7eSjoerg /// \headerfile <x86intrin.h>
157106f32e7eSjoerg ///
157206f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
157306f32e7eSjoerg ///
157406f32e7eSjoerg /// \param __dp
157506f32e7eSjoerg ///    A pointer to a 128-bit memory location. The address of the memory
157606f32e7eSjoerg ///    location has to be 16-byte aligned.
157706f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the loaded values.
157806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_pd(double const * __dp)157906f32e7eSjoerg _mm_load_pd(double const *__dp)
158006f32e7eSjoerg {
1581*13fbcb42Sjoerg   return *(const __m128d*)__dp;
158206f32e7eSjoerg }
158306f32e7eSjoerg 
158406f32e7eSjoerg /// Loads a double-precision floating-point value from a specified memory
158506f32e7eSjoerg ///    location and duplicates it to both vector elements of a 128-bit vector of
158606f32e7eSjoerg ///    [2 x double].
158706f32e7eSjoerg ///
158806f32e7eSjoerg /// \headerfile <x86intrin.h>
158906f32e7eSjoerg ///
159006f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
159106f32e7eSjoerg ///
159206f32e7eSjoerg /// \param __dp
159306f32e7eSjoerg ///    A pointer to a memory location containing a double-precision value.
159406f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the loaded and
159506f32e7eSjoerg ///    duplicated values.
159606f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load1_pd(double const * __dp)159706f32e7eSjoerg _mm_load1_pd(double const *__dp)
159806f32e7eSjoerg {
159906f32e7eSjoerg   struct __mm_load1_pd_struct {
160006f32e7eSjoerg     double __u;
160106f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
1602*13fbcb42Sjoerg   double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u;
160306f32e7eSjoerg   return __extension__ (__m128d){ __u, __u };
160406f32e7eSjoerg }
160506f32e7eSjoerg 
/* Alternate spelling of _mm_load1_pd(); expands to a direct call. */
#define _mm_load_pd1(dp) _mm_load1_pd(dp)
160706f32e7eSjoerg 
160806f32e7eSjoerg /// Loads two double-precision values, in reverse order, from an aligned
160906f32e7eSjoerg ///    memory location into a 128-bit vector of [2 x double].
161006f32e7eSjoerg ///
161106f32e7eSjoerg /// \headerfile <x86intrin.h>
161206f32e7eSjoerg ///
161306f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
161406f32e7eSjoerg /// needed shuffling instructions. In AVX mode, the shuffling may be combined
161506f32e7eSjoerg /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
161606f32e7eSjoerg ///
161706f32e7eSjoerg /// \param __dp
161806f32e7eSjoerg ///    A 16-byte aligned pointer to an array of double-precision values to be
161906f32e7eSjoerg ///    loaded in reverse order.
162006f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
162106f32e7eSjoerg ///    values.
162206f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const * __dp)162306f32e7eSjoerg _mm_loadr_pd(double const *__dp)
162406f32e7eSjoerg {
1625*13fbcb42Sjoerg   __m128d __u = *(const __m128d*)__dp;
162606f32e7eSjoerg   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
162706f32e7eSjoerg }
162806f32e7eSjoerg 
162906f32e7eSjoerg /// Loads a 128-bit floating-point vector of [2 x double] from an
163006f32e7eSjoerg ///    unaligned memory location.
163106f32e7eSjoerg ///
163206f32e7eSjoerg /// \headerfile <x86intrin.h>
163306f32e7eSjoerg ///
163406f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
163506f32e7eSjoerg ///
163606f32e7eSjoerg /// \param __dp
163706f32e7eSjoerg ///    A pointer to a 128-bit memory location. The address of the memory
163806f32e7eSjoerg ///    location does not have to be aligned.
163906f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the loaded values.
164006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const * __dp)164106f32e7eSjoerg _mm_loadu_pd(double const *__dp)
164206f32e7eSjoerg {
164306f32e7eSjoerg   struct __loadu_pd {
164406f32e7eSjoerg     __m128d_u __v;
164506f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
1646*13fbcb42Sjoerg   return ((const struct __loadu_pd*)__dp)->__v;
164706f32e7eSjoerg }
164806f32e7eSjoerg 
164906f32e7eSjoerg /// Loads a 64-bit integer value to the low element of a 128-bit integer
165006f32e7eSjoerg ///    vector and clears the upper element.
165106f32e7eSjoerg ///
165206f32e7eSjoerg /// \headerfile <x86intrin.h>
165306f32e7eSjoerg ///
165406f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
165506f32e7eSjoerg ///
165606f32e7eSjoerg /// \param __a
165706f32e7eSjoerg ///    A pointer to a 64-bit memory location. The address of the memory
165806f32e7eSjoerg ///    location does not have to be aligned.
165906f32e7eSjoerg /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
166006f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si64(void const * __a)166106f32e7eSjoerg _mm_loadu_si64(void const *__a)
166206f32e7eSjoerg {
166306f32e7eSjoerg   struct __loadu_si64 {
166406f32e7eSjoerg     long long __v;
166506f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
1666*13fbcb42Sjoerg   long long __u = ((const struct __loadu_si64*)__a)->__v;
166706f32e7eSjoerg   return __extension__ (__m128i)(__v2di){__u, 0LL};
166806f32e7eSjoerg }
166906f32e7eSjoerg 
167006f32e7eSjoerg /// Loads a 32-bit integer value to the low element of a 128-bit integer
167106f32e7eSjoerg ///    vector and clears the upper element.
167206f32e7eSjoerg ///
167306f32e7eSjoerg /// \headerfile <x86intrin.h>
167406f32e7eSjoerg ///
167506f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
167606f32e7eSjoerg ///
167706f32e7eSjoerg /// \param __a
167806f32e7eSjoerg ///    A pointer to a 32-bit memory location. The address of the memory
167906f32e7eSjoerg ///    location does not have to be aligned.
168006f32e7eSjoerg /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
168106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si32(void const * __a)168206f32e7eSjoerg _mm_loadu_si32(void const *__a)
168306f32e7eSjoerg {
168406f32e7eSjoerg   struct __loadu_si32 {
168506f32e7eSjoerg     int __v;
168606f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
1687*13fbcb42Sjoerg   int __u = ((const struct __loadu_si32*)__a)->__v;
168806f32e7eSjoerg   return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
168906f32e7eSjoerg }
169006f32e7eSjoerg 
169106f32e7eSjoerg /// Loads a 16-bit integer value to the low element of a 128-bit integer
169206f32e7eSjoerg ///    vector and clears the upper element.
169306f32e7eSjoerg ///
169406f32e7eSjoerg /// \headerfile <x86intrin.h>
169506f32e7eSjoerg ///
169606f32e7eSjoerg /// This intrinsic does not correspond to a specific instruction.
169706f32e7eSjoerg ///
169806f32e7eSjoerg /// \param __a
169906f32e7eSjoerg ///    A pointer to a 16-bit memory location. The address of the memory
170006f32e7eSjoerg ///    location does not have to be aligned.
170106f32e7eSjoerg /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
170206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si16(void const * __a)170306f32e7eSjoerg _mm_loadu_si16(void const *__a)
170406f32e7eSjoerg {
170506f32e7eSjoerg   struct __loadu_si16 {
170606f32e7eSjoerg     short __v;
170706f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
1708*13fbcb42Sjoerg   short __u = ((const struct __loadu_si16*)__a)->__v;
170906f32e7eSjoerg   return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
171006f32e7eSjoerg }
171106f32e7eSjoerg 
171206f32e7eSjoerg /// Loads a 64-bit double-precision value to the low element of a
171306f32e7eSjoerg ///    128-bit integer vector and clears the upper element.
171406f32e7eSjoerg ///
171506f32e7eSjoerg /// \headerfile <x86intrin.h>
171606f32e7eSjoerg ///
171706f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
171806f32e7eSjoerg ///
171906f32e7eSjoerg /// \param __dp
172006f32e7eSjoerg ///    A pointer to a memory location containing a double-precision value.
172106f32e7eSjoerg ///    The address of the memory location does not have to be aligned.
172206f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the loaded value.
172306f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_sd(double const * __dp)172406f32e7eSjoerg _mm_load_sd(double const *__dp)
172506f32e7eSjoerg {
172606f32e7eSjoerg   struct __mm_load_sd_struct {
172706f32e7eSjoerg     double __u;
172806f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
1729*13fbcb42Sjoerg   double __u = ((const struct __mm_load_sd_struct*)__dp)->__u;
173006f32e7eSjoerg   return __extension__ (__m128d){ __u, 0 };
173106f32e7eSjoerg }
173206f32e7eSjoerg 
173306f32e7eSjoerg /// Loads a double-precision value into the high-order bits of a 128-bit
173406f32e7eSjoerg ///    vector of [2 x double]. The low-order bits are copied from the low-order
173506f32e7eSjoerg ///    bits of the first operand.
173606f32e7eSjoerg ///
173706f32e7eSjoerg /// \headerfile <x86intrin.h>
173806f32e7eSjoerg ///
173906f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
174006f32e7eSjoerg ///
174106f32e7eSjoerg /// \param __a
174206f32e7eSjoerg ///    A 128-bit vector of [2 x double]. \n
174306f32e7eSjoerg ///    Bits [63:0] are written to bits [63:0] of the result.
174406f32e7eSjoerg /// \param __dp
174506f32e7eSjoerg ///    A pointer to a 64-bit memory location containing a double-precision
174606f32e7eSjoerg ///    floating-point value that is loaded. The loaded value is written to bits
174706f32e7eSjoerg ///    [127:64] of the result. The address of the memory location does not have
174806f32e7eSjoerg ///    to be aligned.
174906f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the moved values.
175006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadh_pd(__m128d __a,double const * __dp)175106f32e7eSjoerg _mm_loadh_pd(__m128d __a, double const *__dp)
175206f32e7eSjoerg {
175306f32e7eSjoerg   struct __mm_loadh_pd_struct {
175406f32e7eSjoerg     double __u;
175506f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
1756*13fbcb42Sjoerg   double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u;
175706f32e7eSjoerg   return __extension__ (__m128d){ __a[0], __u };
175806f32e7eSjoerg }
175906f32e7eSjoerg 
176006f32e7eSjoerg /// Loads a double-precision value into the low-order bits of a 128-bit
176106f32e7eSjoerg ///    vector of [2 x double]. The high-order bits are copied from the
176206f32e7eSjoerg ///    high-order bits of the first operand.
176306f32e7eSjoerg ///
176406f32e7eSjoerg /// \headerfile <x86intrin.h>
176506f32e7eSjoerg ///
176606f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
176706f32e7eSjoerg ///
176806f32e7eSjoerg /// \param __a
176906f32e7eSjoerg ///    A 128-bit vector of [2 x double]. \n
177006f32e7eSjoerg ///    Bits [127:64] are written to bits [127:64] of the result.
177106f32e7eSjoerg /// \param __dp
177206f32e7eSjoerg ///    A pointer to a 64-bit memory location containing a double-precision
177306f32e7eSjoerg ///    floating-point value that is loaded. The loaded value is written to bits
177406f32e7eSjoerg ///    [63:0] of the result. The address of the memory location does not have to
177506f32e7eSjoerg ///    be aligned.
177606f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the moved values.
177706f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadl_pd(__m128d __a,double const * __dp)177806f32e7eSjoerg _mm_loadl_pd(__m128d __a, double const *__dp)
177906f32e7eSjoerg {
178006f32e7eSjoerg   struct __mm_loadl_pd_struct {
178106f32e7eSjoerg     double __u;
178206f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
1783*13fbcb42Sjoerg   double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u;
178406f32e7eSjoerg   return __extension__ (__m128d){ __u, __a[1] };
178506f32e7eSjoerg }
178606f32e7eSjoerg 
178706f32e7eSjoerg /// Constructs a 128-bit floating-point vector of [2 x double] with
178806f32e7eSjoerg ///    unspecified content. This could be used as an argument to another
178906f32e7eSjoerg ///    intrinsic function where the argument is required but the value is not
179006f32e7eSjoerg ///    actually used.
179106f32e7eSjoerg ///
179206f32e7eSjoerg /// \headerfile <x86intrin.h>
179306f32e7eSjoerg ///
179406f32e7eSjoerg /// This intrinsic has no corresponding instruction.
179506f32e7eSjoerg ///
179606f32e7eSjoerg /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
179706f32e7eSjoerg ///    content.
179806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_undefined_pd(void)179906f32e7eSjoerg _mm_undefined_pd(void)
180006f32e7eSjoerg {
180106f32e7eSjoerg   return (__m128d)__builtin_ia32_undef128();
180206f32e7eSjoerg }
180306f32e7eSjoerg 
180406f32e7eSjoerg /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
180506f32e7eSjoerg ///    64 bits of the vector are initialized with the specified double-precision
180606f32e7eSjoerg ///    floating-point value. The upper 64 bits are set to zero.
180706f32e7eSjoerg ///
180806f32e7eSjoerg /// \headerfile <x86intrin.h>
180906f32e7eSjoerg ///
181006f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
181106f32e7eSjoerg ///
181206f32e7eSjoerg /// \param __w
181306f32e7eSjoerg ///    A double-precision floating-point value used to initialize the lower 64
181406f32e7eSjoerg ///    bits of the result.
181506f32e7eSjoerg /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
181606f32e7eSjoerg ///    lower 64 bits contain the value of the parameter. The upper 64 bits are
181706f32e7eSjoerg ///    set to zero.
181806f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_sd(double __w)181906f32e7eSjoerg _mm_set_sd(double __w)
182006f32e7eSjoerg {
182106f32e7eSjoerg   return __extension__ (__m128d){ __w, 0 };
182206f32e7eSjoerg }
182306f32e7eSjoerg 
182406f32e7eSjoerg /// Constructs a 128-bit floating-point vector of [2 x double], with each
182506f32e7eSjoerg ///    of the two double-precision floating-point vector elements set to the
182606f32e7eSjoerg ///    specified double-precision floating-point value.
182706f32e7eSjoerg ///
182806f32e7eSjoerg /// \headerfile <x86intrin.h>
182906f32e7eSjoerg ///
183006f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
183106f32e7eSjoerg ///
183206f32e7eSjoerg /// \param __w
183306f32e7eSjoerg ///    A double-precision floating-point value used to initialize each vector
183406f32e7eSjoerg ///    element of the result.
183506f32e7eSjoerg /// \returns An initialized 128-bit floating-point vector of [2 x double].
183606f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set1_pd(double __w)183706f32e7eSjoerg _mm_set1_pd(double __w)
183806f32e7eSjoerg {
183906f32e7eSjoerg   return __extension__ (__m128d){ __w, __w };
184006f32e7eSjoerg }
184106f32e7eSjoerg 
184206f32e7eSjoerg /// Constructs a 128-bit floating-point vector of [2 x double], with each
184306f32e7eSjoerg ///    of the two double-precision floating-point vector elements set to the
184406f32e7eSjoerg ///    specified double-precision floating-point value.
184506f32e7eSjoerg ///
184606f32e7eSjoerg /// \headerfile <x86intrin.h>
184706f32e7eSjoerg ///
184806f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
184906f32e7eSjoerg ///
185006f32e7eSjoerg /// \param __w
185106f32e7eSjoerg ///    A double-precision floating-point value used to initialize each vector
185206f32e7eSjoerg ///    element of the result.
185306f32e7eSjoerg /// \returns An initialized 128-bit floating-point vector of [2 x double].
185406f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd1(double __w)185506f32e7eSjoerg _mm_set_pd1(double __w)
185606f32e7eSjoerg {
185706f32e7eSjoerg   return _mm_set1_pd(__w);
185806f32e7eSjoerg }
185906f32e7eSjoerg 
186006f32e7eSjoerg /// Constructs a 128-bit floating-point vector of [2 x double]
186106f32e7eSjoerg ///    initialized with the specified double-precision floating-point values.
186206f32e7eSjoerg ///
186306f32e7eSjoerg /// \headerfile <x86intrin.h>
186406f32e7eSjoerg ///
186506f32e7eSjoerg /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
186606f32e7eSjoerg ///
186706f32e7eSjoerg /// \param __w
186806f32e7eSjoerg ///    A double-precision floating-point value used to initialize the upper 64
186906f32e7eSjoerg ///    bits of the result.
187006f32e7eSjoerg /// \param __x
187106f32e7eSjoerg ///    A double-precision floating-point value used to initialize the lower 64
187206f32e7eSjoerg ///    bits of the result.
187306f32e7eSjoerg /// \returns An initialized 128-bit floating-point vector of [2 x double].
187406f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd(double __w,double __x)187506f32e7eSjoerg _mm_set_pd(double __w, double __x)
187606f32e7eSjoerg {
187706f32e7eSjoerg   return __extension__ (__m128d){ __x, __w };
187806f32e7eSjoerg }
187906f32e7eSjoerg 
188006f32e7eSjoerg /// Constructs a 128-bit floating-point vector of [2 x double],
188106f32e7eSjoerg ///    initialized in reverse order with the specified double-precision
188206f32e7eSjoerg ///    floating-point values.
188306f32e7eSjoerg ///
188406f32e7eSjoerg /// \headerfile <x86intrin.h>
188506f32e7eSjoerg ///
188606f32e7eSjoerg /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
188706f32e7eSjoerg ///
188806f32e7eSjoerg /// \param __w
188906f32e7eSjoerg ///    A double-precision floating-point value used to initialize the lower 64
189006f32e7eSjoerg ///    bits of the result.
189106f32e7eSjoerg /// \param __x
189206f32e7eSjoerg ///    A double-precision floating-point value used to initialize the upper 64
189306f32e7eSjoerg ///    bits of the result.
189406f32e7eSjoerg /// \returns An initialized 128-bit floating-point vector of [2 x double].
189506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setr_pd(double __w,double __x)189606f32e7eSjoerg _mm_setr_pd(double __w, double __x)
189706f32e7eSjoerg {
189806f32e7eSjoerg   return __extension__ (__m128d){ __w, __x };
189906f32e7eSjoerg }
190006f32e7eSjoerg 
190106f32e7eSjoerg /// Constructs a 128-bit floating-point vector of [2 x double]
190206f32e7eSjoerg ///    initialized to zero.
190306f32e7eSjoerg ///
190406f32e7eSjoerg /// \headerfile <x86intrin.h>
190506f32e7eSjoerg ///
190606f32e7eSjoerg /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
190706f32e7eSjoerg ///
190806f32e7eSjoerg /// \returns An initialized 128-bit floating-point vector of [2 x double] with
190906f32e7eSjoerg ///    all elements set to zero.
191006f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setzero_pd(void)191106f32e7eSjoerg _mm_setzero_pd(void)
191206f32e7eSjoerg {
191306f32e7eSjoerg   return __extension__ (__m128d){ 0, 0 };
191406f32e7eSjoerg }
191506f32e7eSjoerg 
191606f32e7eSjoerg /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
191706f32e7eSjoerg ///    64 bits are set to the lower 64 bits of the second parameter. The upper
191806f32e7eSjoerg ///    64 bits are set to the upper 64 bits of the first parameter.
191906f32e7eSjoerg ///
192006f32e7eSjoerg /// \headerfile <x86intrin.h>
192106f32e7eSjoerg ///
192206f32e7eSjoerg /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
192306f32e7eSjoerg ///
192406f32e7eSjoerg /// \param __a
192506f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
192606f32e7eSjoerg ///    upper 64 bits of the result.
192706f32e7eSjoerg /// \param __b
192806f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
192906f32e7eSjoerg ///    lower 64 bits of the result.
193006f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the moved values.
193106f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_move_sd(__m128d __a,__m128d __b)193206f32e7eSjoerg _mm_move_sd(__m128d __a, __m128d __b)
193306f32e7eSjoerg {
193406f32e7eSjoerg   __a[0] = __b[0];
193506f32e7eSjoerg   return __a;
193606f32e7eSjoerg }
193706f32e7eSjoerg 
193806f32e7eSjoerg /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
193906f32e7eSjoerg ///    memory location.
194006f32e7eSjoerg ///
194106f32e7eSjoerg /// \headerfile <x86intrin.h>
194206f32e7eSjoerg ///
194306f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
194406f32e7eSjoerg ///
194506f32e7eSjoerg /// \param __dp
194606f32e7eSjoerg ///    A pointer to a 64-bit memory location.
194706f32e7eSjoerg /// \param __a
194806f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the value to be stored.
194906f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_sd(double * __dp,__m128d __a)195006f32e7eSjoerg _mm_store_sd(double *__dp, __m128d __a)
195106f32e7eSjoerg {
195206f32e7eSjoerg   struct __mm_store_sd_struct {
195306f32e7eSjoerg     double __u;
195406f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
195506f32e7eSjoerg   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
195606f32e7eSjoerg }
195706f32e7eSjoerg 
195806f32e7eSjoerg /// Moves packed double-precision values from a 128-bit vector of
195906f32e7eSjoerg ///    [2 x double] to a memory location.
196006f32e7eSjoerg ///
196106f32e7eSjoerg /// \headerfile <x86intrin.h>
196206f32e7eSjoerg ///
196306f32e7eSjoerg /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
196406f32e7eSjoerg ///
196506f32e7eSjoerg /// \param __dp
196606f32e7eSjoerg ///    A pointer to an aligned memory location that can store two
196706f32e7eSjoerg ///    double-precision values.
196806f32e7eSjoerg /// \param __a
196906f32e7eSjoerg ///    A packed 128-bit vector of [2 x double] containing the values to be
197006f32e7eSjoerg ///    moved.
197106f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd(double * __dp,__m128d __a)197206f32e7eSjoerg _mm_store_pd(double *__dp, __m128d __a)
197306f32e7eSjoerg {
197406f32e7eSjoerg   *(__m128d*)__dp = __a;
197506f32e7eSjoerg }
197606f32e7eSjoerg 
197706f32e7eSjoerg /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
197806f32e7eSjoerg ///    the upper and lower 64 bits of a memory location.
197906f32e7eSjoerg ///
198006f32e7eSjoerg /// \headerfile <x86intrin.h>
198106f32e7eSjoerg ///
198206f32e7eSjoerg /// This intrinsic corresponds to the
198306f32e7eSjoerg ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
198406f32e7eSjoerg ///
198506f32e7eSjoerg /// \param __dp
198606f32e7eSjoerg ///    A pointer to a memory location that can store two double-precision
198706f32e7eSjoerg ///    values.
198806f32e7eSjoerg /// \param __a
198906f32e7eSjoerg ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
199006f32e7eSjoerg ///    of the values in \a __dp.
199106f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_pd(double * __dp,__m128d __a)199206f32e7eSjoerg _mm_store1_pd(double *__dp, __m128d __a)
199306f32e7eSjoerg {
199406f32e7eSjoerg   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
199506f32e7eSjoerg   _mm_store_pd(__dp, __a);
199606f32e7eSjoerg }
199706f32e7eSjoerg 
199806f32e7eSjoerg /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
199906f32e7eSjoerg ///    the upper and lower 64 bits of a memory location.
200006f32e7eSjoerg ///
200106f32e7eSjoerg /// \headerfile <x86intrin.h>
200206f32e7eSjoerg ///
200306f32e7eSjoerg /// This intrinsic corresponds to the
200406f32e7eSjoerg ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
200506f32e7eSjoerg ///
200606f32e7eSjoerg /// \param __dp
200706f32e7eSjoerg ///    A pointer to a memory location that can store two double-precision
200806f32e7eSjoerg ///    values.
200906f32e7eSjoerg /// \param __a
201006f32e7eSjoerg ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
201106f32e7eSjoerg ///    of the values in \a __dp.
201206f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd1(double * __dp,__m128d __a)201306f32e7eSjoerg _mm_store_pd1(double *__dp, __m128d __a)
201406f32e7eSjoerg {
201506f32e7eSjoerg   _mm_store1_pd(__dp, __a);
201606f32e7eSjoerg }
201706f32e7eSjoerg 
201806f32e7eSjoerg /// Stores a 128-bit vector of [2 x double] into an unaligned memory
201906f32e7eSjoerg ///    location.
202006f32e7eSjoerg ///
202106f32e7eSjoerg /// \headerfile <x86intrin.h>
202206f32e7eSjoerg ///
202306f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
202406f32e7eSjoerg ///
202506f32e7eSjoerg /// \param __dp
202606f32e7eSjoerg ///    A pointer to a 128-bit memory location. The address of the memory
202706f32e7eSjoerg ///    location does not have to be aligned.
202806f32e7eSjoerg /// \param __a
202906f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the values to be stored.
203006f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double * __dp,__m128d __a)203106f32e7eSjoerg _mm_storeu_pd(double *__dp, __m128d __a)
203206f32e7eSjoerg {
203306f32e7eSjoerg   struct __storeu_pd {
203406f32e7eSjoerg     __m128d_u __v;
203506f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
203606f32e7eSjoerg   ((struct __storeu_pd*)__dp)->__v = __a;
203706f32e7eSjoerg }
203806f32e7eSjoerg 
203906f32e7eSjoerg /// Stores two double-precision values, in reverse order, from a 128-bit
204006f32e7eSjoerg ///    vector of [2 x double] to a 16-byte aligned memory location.
204106f32e7eSjoerg ///
204206f32e7eSjoerg /// \headerfile <x86intrin.h>
204306f32e7eSjoerg ///
204406f32e7eSjoerg /// This intrinsic corresponds to a shuffling instruction followed by a
204506f32e7eSjoerg /// <c> VMOVAPD / MOVAPD </c> instruction.
204606f32e7eSjoerg ///
204706f32e7eSjoerg /// \param __dp
204806f32e7eSjoerg ///    A pointer to a 16-byte aligned memory location that can store two
204906f32e7eSjoerg ///    double-precision values.
205006f32e7eSjoerg /// \param __a
205106f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the values to be reversed and
205206f32e7eSjoerg ///    stored.
205306f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_pd(double * __dp,__m128d __a)205406f32e7eSjoerg _mm_storer_pd(double *__dp, __m128d __a)
205506f32e7eSjoerg {
205606f32e7eSjoerg   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
205706f32e7eSjoerg   *(__m128d *)__dp = __a;
205806f32e7eSjoerg }
205906f32e7eSjoerg 
206006f32e7eSjoerg /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
206106f32e7eSjoerg ///    memory location.
206206f32e7eSjoerg ///
206306f32e7eSjoerg /// \headerfile <x86intrin.h>
206406f32e7eSjoerg ///
206506f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
206606f32e7eSjoerg ///
206706f32e7eSjoerg /// \param __dp
206806f32e7eSjoerg ///    A pointer to a 64-bit memory location.
206906f32e7eSjoerg /// \param __a
207006f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the value to be stored.
207106f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pd(double * __dp,__m128d __a)207206f32e7eSjoerg _mm_storeh_pd(double *__dp, __m128d __a)
207306f32e7eSjoerg {
207406f32e7eSjoerg   struct __mm_storeh_pd_struct {
207506f32e7eSjoerg     double __u;
207606f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
207706f32e7eSjoerg   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
207806f32e7eSjoerg }
207906f32e7eSjoerg 
208006f32e7eSjoerg /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
208106f32e7eSjoerg ///    memory location.
208206f32e7eSjoerg ///
208306f32e7eSjoerg /// \headerfile <x86intrin.h>
208406f32e7eSjoerg ///
208506f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
208606f32e7eSjoerg ///
208706f32e7eSjoerg /// \param __dp
208806f32e7eSjoerg ///    A pointer to a 64-bit memory location.
208906f32e7eSjoerg /// \param __a
209006f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the value to be stored.
209106f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pd(double * __dp,__m128d __a)209206f32e7eSjoerg _mm_storel_pd(double *__dp, __m128d __a)
209306f32e7eSjoerg {
209406f32e7eSjoerg   struct __mm_storeh_pd_struct {
209506f32e7eSjoerg     double __u;
209606f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
209706f32e7eSjoerg   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
209806f32e7eSjoerg }
209906f32e7eSjoerg 
210006f32e7eSjoerg /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
210106f32e7eSjoerg ///    saving the lower 8 bits of each sum in the corresponding element of a
210206f32e7eSjoerg ///    128-bit result vector of [16 x i8].
210306f32e7eSjoerg ///
210406f32e7eSjoerg ///    The integer elements of both parameters can be either signed or unsigned.
210506f32e7eSjoerg ///
210606f32e7eSjoerg /// \headerfile <x86intrin.h>
210706f32e7eSjoerg ///
210806f32e7eSjoerg /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
210906f32e7eSjoerg ///
211006f32e7eSjoerg /// \param __a
211106f32e7eSjoerg ///    A 128-bit vector of [16 x i8].
211206f32e7eSjoerg /// \param __b
211306f32e7eSjoerg ///    A 128-bit vector of [16 x i8].
211406f32e7eSjoerg /// \returns A 128-bit vector of [16 x i8] containing the sums of both
211506f32e7eSjoerg ///    parameters.
211606f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi8(__m128i __a,__m128i __b)211706f32e7eSjoerg _mm_add_epi8(__m128i __a, __m128i __b)
211806f32e7eSjoerg {
211906f32e7eSjoerg   return (__m128i)((__v16qu)__a + (__v16qu)__b);
212006f32e7eSjoerg }
212106f32e7eSjoerg 
212206f32e7eSjoerg /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
212306f32e7eSjoerg ///    saving the lower 16 bits of each sum in the corresponding element of a
212406f32e7eSjoerg ///    128-bit result vector of [8 x i16].
212506f32e7eSjoerg ///
212606f32e7eSjoerg ///    The integer elements of both parameters can be either signed or unsigned.
212706f32e7eSjoerg ///
212806f32e7eSjoerg /// \headerfile <x86intrin.h>
212906f32e7eSjoerg ///
213006f32e7eSjoerg /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
213106f32e7eSjoerg ///
213206f32e7eSjoerg /// \param __a
213306f32e7eSjoerg ///    A 128-bit vector of [8 x i16].
213406f32e7eSjoerg /// \param __b
213506f32e7eSjoerg ///    A 128-bit vector of [8 x i16].
213606f32e7eSjoerg /// \returns A 128-bit vector of [8 x i16] containing the sums of both
213706f32e7eSjoerg ///    parameters.
213806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi16(__m128i __a,__m128i __b)213906f32e7eSjoerg _mm_add_epi16(__m128i __a, __m128i __b)
214006f32e7eSjoerg {
214106f32e7eSjoerg   return (__m128i)((__v8hu)__a + (__v8hu)__b);
214206f32e7eSjoerg }
214306f32e7eSjoerg 
214406f32e7eSjoerg /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
214506f32e7eSjoerg ///    saving the lower 32 bits of each sum in the corresponding element of a
214606f32e7eSjoerg ///    128-bit result vector of [4 x i32].
214706f32e7eSjoerg ///
214806f32e7eSjoerg ///    The integer elements of both parameters can be either signed or unsigned.
214906f32e7eSjoerg ///
215006f32e7eSjoerg /// \headerfile <x86intrin.h>
215106f32e7eSjoerg ///
215206f32e7eSjoerg /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
215306f32e7eSjoerg ///
215406f32e7eSjoerg /// \param __a
215506f32e7eSjoerg ///    A 128-bit vector of [4 x i32].
215606f32e7eSjoerg /// \param __b
215706f32e7eSjoerg ///    A 128-bit vector of [4 x i32].
215806f32e7eSjoerg /// \returns A 128-bit vector of [4 x i32] containing the sums of both
215906f32e7eSjoerg ///    parameters.
216006f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi32(__m128i __a,__m128i __b)216106f32e7eSjoerg _mm_add_epi32(__m128i __a, __m128i __b)
216206f32e7eSjoerg {
216306f32e7eSjoerg   return (__m128i)((__v4su)__a + (__v4su)__b);
216406f32e7eSjoerg }
216506f32e7eSjoerg 
216606f32e7eSjoerg /// Adds two signed or unsigned 64-bit integer values, returning the
216706f32e7eSjoerg ///    lower 64 bits of the sum.
216806f32e7eSjoerg ///
216906f32e7eSjoerg /// \headerfile <x86intrin.h>
217006f32e7eSjoerg ///
217106f32e7eSjoerg /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
217206f32e7eSjoerg ///
217306f32e7eSjoerg /// \param __a
217406f32e7eSjoerg ///    A 64-bit integer.
217506f32e7eSjoerg /// \param __b
217606f32e7eSjoerg ///    A 64-bit integer.
217706f32e7eSjoerg /// \returns A 64-bit integer containing the sum of both parameters.
217806f32e7eSjoerg static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_add_si64(__m64 __a,__m64 __b)217906f32e7eSjoerg _mm_add_si64(__m64 __a, __m64 __b)
218006f32e7eSjoerg {
218106f32e7eSjoerg   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
218206f32e7eSjoerg }
218306f32e7eSjoerg 
218406f32e7eSjoerg /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
218506f32e7eSjoerg ///    saving the lower 64 bits of each sum in the corresponding element of a
218606f32e7eSjoerg ///    128-bit result vector of [2 x i64].
218706f32e7eSjoerg ///
218806f32e7eSjoerg ///    The integer elements of both parameters can be either signed or unsigned.
218906f32e7eSjoerg ///
219006f32e7eSjoerg /// \headerfile <x86intrin.h>
219106f32e7eSjoerg ///
219206f32e7eSjoerg /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
219306f32e7eSjoerg ///
219406f32e7eSjoerg /// \param __a
219506f32e7eSjoerg ///    A 128-bit vector of [2 x i64].
219606f32e7eSjoerg /// \param __b
219706f32e7eSjoerg ///    A 128-bit vector of [2 x i64].
219806f32e7eSjoerg /// \returns A 128-bit vector of [2 x i64] containing the sums of both
219906f32e7eSjoerg ///    parameters.
220006f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi64(__m128i __a,__m128i __b)220106f32e7eSjoerg _mm_add_epi64(__m128i __a, __m128i __b)
220206f32e7eSjoerg {
220306f32e7eSjoerg   return (__m128i)((__v2du)__a + (__v2du)__b);
220406f32e7eSjoerg }
220506f32e7eSjoerg 
220606f32e7eSjoerg /// Adds, with saturation, the corresponding elements of two 128-bit
220706f32e7eSjoerg ///    signed [16 x i8] vectors, saving each sum in the corresponding element of
220806f32e7eSjoerg ///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
220906f32e7eSjoerg ///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
221006f32e7eSjoerg ///
221106f32e7eSjoerg /// \headerfile <x86intrin.h>
221206f32e7eSjoerg ///
221306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
221406f32e7eSjoerg ///
221506f32e7eSjoerg /// \param __a
221606f32e7eSjoerg ///    A 128-bit signed [16 x i8] vector.
221706f32e7eSjoerg /// \param __b
221806f32e7eSjoerg ///    A 128-bit signed [16 x i8] vector.
221906f32e7eSjoerg /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
222006f32e7eSjoerg ///    both parameters.
222106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a,__m128i __b)222206f32e7eSjoerg _mm_adds_epi8(__m128i __a, __m128i __b)
222306f32e7eSjoerg {
222406f32e7eSjoerg   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
222506f32e7eSjoerg }
222606f32e7eSjoerg 
222706f32e7eSjoerg /// Adds, with saturation, the corresponding elements of two 128-bit
222806f32e7eSjoerg ///    signed [8 x i16] vectors, saving each sum in the corresponding element of
222906f32e7eSjoerg ///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
223006f32e7eSjoerg ///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
223106f32e7eSjoerg ///    0x8000.
223206f32e7eSjoerg ///
223306f32e7eSjoerg /// \headerfile <x86intrin.h>
223406f32e7eSjoerg ///
223506f32e7eSjoerg /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
223606f32e7eSjoerg ///
223706f32e7eSjoerg /// \param __a
223806f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
223906f32e7eSjoerg /// \param __b
224006f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
224106f32e7eSjoerg /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
224206f32e7eSjoerg ///    both parameters.
224306f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a,__m128i __b)224406f32e7eSjoerg _mm_adds_epi16(__m128i __a, __m128i __b)
224506f32e7eSjoerg {
224606f32e7eSjoerg   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
224706f32e7eSjoerg }
224806f32e7eSjoerg 
224906f32e7eSjoerg /// Adds, with saturation, the corresponding elements of two 128-bit
225006f32e7eSjoerg ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
225106f32e7eSjoerg ///    of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
225206f32e7eSjoerg ///    are saturated to 0xFF. Negative sums are saturated to 0x00.
225306f32e7eSjoerg ///
225406f32e7eSjoerg /// \headerfile <x86intrin.h>
225506f32e7eSjoerg ///
225606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
225706f32e7eSjoerg ///
225806f32e7eSjoerg /// \param __a
225906f32e7eSjoerg ///    A 128-bit unsigned [16 x i8] vector.
226006f32e7eSjoerg /// \param __b
226106f32e7eSjoerg ///    A 128-bit unsigned [16 x i8] vector.
226206f32e7eSjoerg /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
226306f32e7eSjoerg ///    of both parameters.
226406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a,__m128i __b)226506f32e7eSjoerg _mm_adds_epu8(__m128i __a, __m128i __b)
226606f32e7eSjoerg {
226706f32e7eSjoerg   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
226806f32e7eSjoerg }
226906f32e7eSjoerg 
227006f32e7eSjoerg /// Adds, with saturation, the corresponding elements of two 128-bit
227106f32e7eSjoerg ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
227206f32e7eSjoerg ///    of a 128-bit result vector of [8 x i16]. Positive sums greater than
227306f32e7eSjoerg ///    0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
227406f32e7eSjoerg ///
227506f32e7eSjoerg /// \headerfile <x86intrin.h>
227606f32e7eSjoerg ///
227706f32e7eSjoerg /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
227806f32e7eSjoerg ///
227906f32e7eSjoerg /// \param __a
228006f32e7eSjoerg ///    A 128-bit unsigned [8 x i16] vector.
228106f32e7eSjoerg /// \param __b
228206f32e7eSjoerg ///    A 128-bit unsigned [8 x i16] vector.
228306f32e7eSjoerg /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
228406f32e7eSjoerg ///    of both parameters.
228506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a,__m128i __b)228606f32e7eSjoerg _mm_adds_epu16(__m128i __a, __m128i __b)
228706f32e7eSjoerg {
228806f32e7eSjoerg   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
228906f32e7eSjoerg }
229006f32e7eSjoerg 
229106f32e7eSjoerg /// Computes the rounded averages of corresponding elements of two
229206f32e7eSjoerg ///    128-bit unsigned [16 x i8] vectors, saving each result in the
229306f32e7eSjoerg ///    corresponding element of a 128-bit result vector of [16 x i8].
229406f32e7eSjoerg ///
229506f32e7eSjoerg /// \headerfile <x86intrin.h>
229606f32e7eSjoerg ///
229706f32e7eSjoerg /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
229806f32e7eSjoerg ///
229906f32e7eSjoerg /// \param __a
230006f32e7eSjoerg ///    A 128-bit unsigned [16 x i8] vector.
230106f32e7eSjoerg /// \param __b
230206f32e7eSjoerg ///    A 128-bit unsigned [16 x i8] vector.
230306f32e7eSjoerg /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
230406f32e7eSjoerg ///    averages of both parameters.
230506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a,__m128i __b)230606f32e7eSjoerg _mm_avg_epu8(__m128i __a, __m128i __b)
230706f32e7eSjoerg {
230806f32e7eSjoerg   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
230906f32e7eSjoerg }
231006f32e7eSjoerg 
231106f32e7eSjoerg /// Computes the rounded averages of corresponding elements of two
231206f32e7eSjoerg ///    128-bit unsigned [8 x i16] vectors, saving each result in the
231306f32e7eSjoerg ///    corresponding element of a 128-bit result vector of [8 x i16].
231406f32e7eSjoerg ///
231506f32e7eSjoerg /// \headerfile <x86intrin.h>
231606f32e7eSjoerg ///
231706f32e7eSjoerg /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
231806f32e7eSjoerg ///
231906f32e7eSjoerg /// \param __a
232006f32e7eSjoerg ///    A 128-bit unsigned [8 x i16] vector.
232106f32e7eSjoerg /// \param __b
232206f32e7eSjoerg ///    A 128-bit unsigned [8 x i16] vector.
232306f32e7eSjoerg /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
232406f32e7eSjoerg ///    averages of both parameters.
232506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a,__m128i __b)232606f32e7eSjoerg _mm_avg_epu16(__m128i __a, __m128i __b)
232706f32e7eSjoerg {
232806f32e7eSjoerg   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
232906f32e7eSjoerg }
233006f32e7eSjoerg 
233106f32e7eSjoerg /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
233206f32e7eSjoerg ///    vectors, producing eight intermediate 32-bit signed integer products, and
233306f32e7eSjoerg ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
233406f32e7eSjoerg ///    [4 x i32] vector.
233506f32e7eSjoerg ///
233606f32e7eSjoerg ///    For example, bits [15:0] of both parameters are multiplied producing a
233706f32e7eSjoerg ///    32-bit product, bits [31:16] of both parameters are multiplied producing
233806f32e7eSjoerg ///    a 32-bit product, and the sum of those two products becomes bits [31:0]
233906f32e7eSjoerg ///    of the result.
234006f32e7eSjoerg ///
234106f32e7eSjoerg /// \headerfile <x86intrin.h>
234206f32e7eSjoerg ///
234306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
234406f32e7eSjoerg ///
234506f32e7eSjoerg /// \param __a
234606f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
234706f32e7eSjoerg /// \param __b
234806f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
234906f32e7eSjoerg /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
235006f32e7eSjoerg ///    of both parameters.
235106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd_epi16(__m128i __a,__m128i __b)235206f32e7eSjoerg _mm_madd_epi16(__m128i __a, __m128i __b)
235306f32e7eSjoerg {
235406f32e7eSjoerg   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
235506f32e7eSjoerg }
235606f32e7eSjoerg 
235706f32e7eSjoerg /// Compares corresponding elements of two 128-bit signed [8 x i16]
235806f32e7eSjoerg ///    vectors, saving the greater value from each comparison in the
235906f32e7eSjoerg ///    corresponding element of a 128-bit result vector of [8 x i16].
236006f32e7eSjoerg ///
236106f32e7eSjoerg /// \headerfile <x86intrin.h>
236206f32e7eSjoerg ///
236306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
236406f32e7eSjoerg ///
236506f32e7eSjoerg /// \param __a
236606f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
236706f32e7eSjoerg /// \param __b
236806f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
236906f32e7eSjoerg /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
237006f32e7eSjoerg ///    each comparison.
237106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a,__m128i __b)237206f32e7eSjoerg _mm_max_epi16(__m128i __a, __m128i __b)
237306f32e7eSjoerg {
237406f32e7eSjoerg   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
237506f32e7eSjoerg }
237606f32e7eSjoerg 
237706f32e7eSjoerg /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
237806f32e7eSjoerg ///    vectors, saving the greater value from each comparison in the
237906f32e7eSjoerg ///    corresponding element of a 128-bit result vector of [16 x i8].
238006f32e7eSjoerg ///
238106f32e7eSjoerg /// \headerfile <x86intrin.h>
238206f32e7eSjoerg ///
238306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
238406f32e7eSjoerg ///
238506f32e7eSjoerg /// \param __a
238606f32e7eSjoerg ///    A 128-bit unsigned [16 x i8] vector.
238706f32e7eSjoerg /// \param __b
238806f32e7eSjoerg ///    A 128-bit unsigned [16 x i8] vector.
238906f32e7eSjoerg /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
239006f32e7eSjoerg ///    each comparison.
239106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a,__m128i __b)239206f32e7eSjoerg _mm_max_epu8(__m128i __a, __m128i __b)
239306f32e7eSjoerg {
239406f32e7eSjoerg   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
239506f32e7eSjoerg }
239606f32e7eSjoerg 
239706f32e7eSjoerg /// Compares corresponding elements of two 128-bit signed [8 x i16]
239806f32e7eSjoerg ///    vectors, saving the smaller value from each comparison in the
239906f32e7eSjoerg ///    corresponding element of a 128-bit result vector of [8 x i16].
240006f32e7eSjoerg ///
240106f32e7eSjoerg /// \headerfile <x86intrin.h>
240206f32e7eSjoerg ///
240306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
240406f32e7eSjoerg ///
240506f32e7eSjoerg /// \param __a
240606f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
240706f32e7eSjoerg /// \param __b
240806f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
240906f32e7eSjoerg /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
241006f32e7eSjoerg ///    each comparison.
241106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a,__m128i __b)241206f32e7eSjoerg _mm_min_epi16(__m128i __a, __m128i __b)
241306f32e7eSjoerg {
241406f32e7eSjoerg   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
241506f32e7eSjoerg }
241606f32e7eSjoerg 
241706f32e7eSjoerg /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
241806f32e7eSjoerg ///    vectors, saving the smaller value from each comparison in the
241906f32e7eSjoerg ///    corresponding element of a 128-bit result vector of [16 x i8].
242006f32e7eSjoerg ///
242106f32e7eSjoerg /// \headerfile <x86intrin.h>
242206f32e7eSjoerg ///
242306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
242406f32e7eSjoerg ///
242506f32e7eSjoerg /// \param __a
242606f32e7eSjoerg ///    A 128-bit unsigned [16 x i8] vector.
242706f32e7eSjoerg /// \param __b
242806f32e7eSjoerg ///    A 128-bit unsigned [16 x i8] vector.
242906f32e7eSjoerg /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
243006f32e7eSjoerg ///    each comparison.
243106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a,__m128i __b)243206f32e7eSjoerg _mm_min_epu8(__m128i __a, __m128i __b)
243306f32e7eSjoerg {
243406f32e7eSjoerg   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
243506f32e7eSjoerg }
243606f32e7eSjoerg 
243706f32e7eSjoerg /// Multiplies the corresponding elements of two signed [8 x i16]
243806f32e7eSjoerg ///    vectors, saving the upper 16 bits of each 32-bit product in the
243906f32e7eSjoerg ///    corresponding element of a 128-bit signed [8 x i16] result vector.
244006f32e7eSjoerg ///
244106f32e7eSjoerg /// \headerfile <x86intrin.h>
244206f32e7eSjoerg ///
244306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
244406f32e7eSjoerg ///
244506f32e7eSjoerg /// \param __a
244606f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
244706f32e7eSjoerg /// \param __b
244806f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
244906f32e7eSjoerg /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
245006f32e7eSjoerg ///    each of the eight 32-bit products.
245106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epi16(__m128i __a,__m128i __b)245206f32e7eSjoerg _mm_mulhi_epi16(__m128i __a, __m128i __b)
245306f32e7eSjoerg {
245406f32e7eSjoerg   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
245506f32e7eSjoerg }
245606f32e7eSjoerg 
245706f32e7eSjoerg /// Multiplies the corresponding elements of two unsigned [8 x i16]
245806f32e7eSjoerg ///    vectors, saving the upper 16 bits of each 32-bit product in the
245906f32e7eSjoerg ///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
246006f32e7eSjoerg ///
246106f32e7eSjoerg /// \headerfile <x86intrin.h>
246206f32e7eSjoerg ///
246306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
246406f32e7eSjoerg ///
246506f32e7eSjoerg /// \param __a
246606f32e7eSjoerg ///    A 128-bit unsigned [8 x i16] vector.
246706f32e7eSjoerg /// \param __b
246806f32e7eSjoerg ///    A 128-bit unsigned [8 x i16] vector.
246906f32e7eSjoerg /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
247006f32e7eSjoerg ///    of each of the eight 32-bit products.
247106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epu16(__m128i __a,__m128i __b)247206f32e7eSjoerg _mm_mulhi_epu16(__m128i __a, __m128i __b)
247306f32e7eSjoerg {
247406f32e7eSjoerg   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
247506f32e7eSjoerg }
247606f32e7eSjoerg 
247706f32e7eSjoerg /// Multiplies the corresponding elements of two signed [8 x i16]
247806f32e7eSjoerg ///    vectors, saving the lower 16 bits of each 32-bit product in the
247906f32e7eSjoerg ///    corresponding element of a 128-bit signed [8 x i16] result vector.
248006f32e7eSjoerg ///
248106f32e7eSjoerg /// \headerfile <x86intrin.h>
248206f32e7eSjoerg ///
248306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
248406f32e7eSjoerg ///
248506f32e7eSjoerg /// \param __a
248606f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
248706f32e7eSjoerg /// \param __b
248806f32e7eSjoerg ///    A 128-bit signed [8 x i16] vector.
248906f32e7eSjoerg /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
249006f32e7eSjoerg ///    each of the eight 32-bit products.
249106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi16(__m128i __a,__m128i __b)249206f32e7eSjoerg _mm_mullo_epi16(__m128i __a, __m128i __b)
249306f32e7eSjoerg {
249406f32e7eSjoerg   return (__m128i)((__v8hu)__a * (__v8hu)__b);
249506f32e7eSjoerg }
249606f32e7eSjoerg 
249706f32e7eSjoerg /// Multiplies 32-bit unsigned integer values contained in the lower bits
249806f32e7eSjoerg ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
249906f32e7eSjoerg ///    product.
250006f32e7eSjoerg ///
250106f32e7eSjoerg /// \headerfile <x86intrin.h>
250206f32e7eSjoerg ///
250306f32e7eSjoerg /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
250406f32e7eSjoerg ///
250506f32e7eSjoerg /// \param __a
250606f32e7eSjoerg ///    A 64-bit integer containing one of the source operands.
250706f32e7eSjoerg /// \param __b
250806f32e7eSjoerg ///    A 64-bit integer containing one of the source operands.
250906f32e7eSjoerg /// \returns A 64-bit integer vector containing the product of both operands.
251006f32e7eSjoerg static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mul_su32(__m64 __a,__m64 __b)251106f32e7eSjoerg _mm_mul_su32(__m64 __a, __m64 __b)
251206f32e7eSjoerg {
251306f32e7eSjoerg   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
251406f32e7eSjoerg }
251506f32e7eSjoerg 
251606f32e7eSjoerg /// Multiplies 32-bit unsigned integer values contained in the lower
251706f32e7eSjoerg ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
251806f32e7eSjoerg ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
251906f32e7eSjoerg ///
252006f32e7eSjoerg /// \headerfile <x86intrin.h>
252106f32e7eSjoerg ///
252206f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
252306f32e7eSjoerg ///
252406f32e7eSjoerg /// \param __a
252506f32e7eSjoerg ///    A [2 x i64] vector containing one of the source operands.
252606f32e7eSjoerg /// \param __b
252706f32e7eSjoerg ///    A [2 x i64] vector containing one of the source operands.
252806f32e7eSjoerg /// \returns A [2 x i64] vector containing the product of both operands.
252906f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epu32(__m128i __a,__m128i __b)253006f32e7eSjoerg _mm_mul_epu32(__m128i __a, __m128i __b)
253106f32e7eSjoerg {
253206f32e7eSjoerg   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
253306f32e7eSjoerg }
253406f32e7eSjoerg 
253506f32e7eSjoerg /// Computes the absolute differences of corresponding 8-bit integer
253606f32e7eSjoerg ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
253706f32e7eSjoerg ///    separately sums the second 8 absolute differences. Packs these two
253806f32e7eSjoerg ///    unsigned 16-bit integer sums into the upper and lower elements of a
253906f32e7eSjoerg ///    [2 x i64] vector.
254006f32e7eSjoerg ///
254106f32e7eSjoerg /// \headerfile <x86intrin.h>
254206f32e7eSjoerg ///
254306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
254406f32e7eSjoerg ///
254506f32e7eSjoerg /// \param __a
254606f32e7eSjoerg ///    A 128-bit integer vector containing one of the source operands.
254706f32e7eSjoerg /// \param __b
254806f32e7eSjoerg ///    A 128-bit integer vector containing one of the source operands.
254906f32e7eSjoerg /// \returns A [2 x i64] vector containing the sums of the sets of absolute
255006f32e7eSjoerg ///    differences between both operands.
255106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sad_epu8(__m128i __a,__m128i __b)255206f32e7eSjoerg _mm_sad_epu8(__m128i __a, __m128i __b)
255306f32e7eSjoerg {
255406f32e7eSjoerg   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
255506f32e7eSjoerg }
255606f32e7eSjoerg 
255706f32e7eSjoerg /// Subtracts the corresponding 8-bit integer values in the operands.
255806f32e7eSjoerg ///
255906f32e7eSjoerg /// \headerfile <x86intrin.h>
256006f32e7eSjoerg ///
256106f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
256206f32e7eSjoerg ///
256306f32e7eSjoerg /// \param __a
256406f32e7eSjoerg ///    A 128-bit integer vector containing the minuends.
256506f32e7eSjoerg /// \param __b
256606f32e7eSjoerg ///    A 128-bit integer vector containing the subtrahends.
256706f32e7eSjoerg /// \returns A 128-bit integer vector containing the differences of the values
256806f32e7eSjoerg ///    in the operands.
256906f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi8(__m128i __a,__m128i __b)257006f32e7eSjoerg _mm_sub_epi8(__m128i __a, __m128i __b)
257106f32e7eSjoerg {
257206f32e7eSjoerg   return (__m128i)((__v16qu)__a - (__v16qu)__b);
257306f32e7eSjoerg }
257406f32e7eSjoerg 
257506f32e7eSjoerg /// Subtracts the corresponding 16-bit integer values in the operands.
257606f32e7eSjoerg ///
257706f32e7eSjoerg /// \headerfile <x86intrin.h>
257806f32e7eSjoerg ///
257906f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
258006f32e7eSjoerg ///
258106f32e7eSjoerg /// \param __a
258206f32e7eSjoerg ///    A 128-bit integer vector containing the minuends.
258306f32e7eSjoerg /// \param __b
258406f32e7eSjoerg ///    A 128-bit integer vector containing the subtrahends.
258506f32e7eSjoerg /// \returns A 128-bit integer vector containing the differences of the values
258606f32e7eSjoerg ///    in the operands.
258706f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi16(__m128i __a,__m128i __b)258806f32e7eSjoerg _mm_sub_epi16(__m128i __a, __m128i __b)
258906f32e7eSjoerg {
259006f32e7eSjoerg   return (__m128i)((__v8hu)__a - (__v8hu)__b);
259106f32e7eSjoerg }
259206f32e7eSjoerg 
259306f32e7eSjoerg /// Subtracts the corresponding 32-bit integer values in the operands.
259406f32e7eSjoerg ///
259506f32e7eSjoerg /// \headerfile <x86intrin.h>
259606f32e7eSjoerg ///
259706f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
259806f32e7eSjoerg ///
259906f32e7eSjoerg /// \param __a
260006f32e7eSjoerg ///    A 128-bit integer vector containing the minuends.
260106f32e7eSjoerg /// \param __b
260206f32e7eSjoerg ///    A 128-bit integer vector containing the subtrahends.
260306f32e7eSjoerg /// \returns A 128-bit integer vector containing the differences of the values
260406f32e7eSjoerg ///    in the operands.
260506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi32(__m128i __a,__m128i __b)260606f32e7eSjoerg _mm_sub_epi32(__m128i __a, __m128i __b)
260706f32e7eSjoerg {
260806f32e7eSjoerg   return (__m128i)((__v4su)__a - (__v4su)__b);
260906f32e7eSjoerg }
261006f32e7eSjoerg 
261106f32e7eSjoerg /// Subtracts signed or unsigned 64-bit integer values and writes the
261206f32e7eSjoerg ///    difference to the corresponding bits in the destination.
261306f32e7eSjoerg ///
261406f32e7eSjoerg /// \headerfile <x86intrin.h>
261506f32e7eSjoerg ///
261606f32e7eSjoerg /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
261706f32e7eSjoerg ///
261806f32e7eSjoerg /// \param __a
261906f32e7eSjoerg ///    A 64-bit integer vector containing the minuend.
262006f32e7eSjoerg /// \param __b
262106f32e7eSjoerg ///    A 64-bit integer vector containing the subtrahend.
262206f32e7eSjoerg /// \returns A 64-bit integer vector containing the difference of the values in
262306f32e7eSjoerg ///    the operands.
262406f32e7eSjoerg static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sub_si64(__m64 __a,__m64 __b)262506f32e7eSjoerg _mm_sub_si64(__m64 __a, __m64 __b)
262606f32e7eSjoerg {
262706f32e7eSjoerg   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
262806f32e7eSjoerg }
262906f32e7eSjoerg 
263006f32e7eSjoerg /// Subtracts the corresponding elements of two [2 x i64] vectors.
263106f32e7eSjoerg ///
263206f32e7eSjoerg /// \headerfile <x86intrin.h>
263306f32e7eSjoerg ///
263406f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
263506f32e7eSjoerg ///
263606f32e7eSjoerg /// \param __a
263706f32e7eSjoerg ///    A 128-bit integer vector containing the minuends.
263806f32e7eSjoerg /// \param __b
263906f32e7eSjoerg ///    A 128-bit integer vector containing the subtrahends.
264006f32e7eSjoerg /// \returns A 128-bit integer vector containing the differences of the values
264106f32e7eSjoerg ///    in the operands.
264206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi64(__m128i __a,__m128i __b)264306f32e7eSjoerg _mm_sub_epi64(__m128i __a, __m128i __b)
264406f32e7eSjoerg {
264506f32e7eSjoerg   return (__m128i)((__v2du)__a - (__v2du)__b);
264606f32e7eSjoerg }
264706f32e7eSjoerg 
264806f32e7eSjoerg /// Subtracts corresponding 8-bit signed integer values in the input and
264906f32e7eSjoerg ///    returns the differences in the corresponding bytes in the destination.
265006f32e7eSjoerg ///    Differences greater than 0x7F are saturated to 0x7F, and differences less
265106f32e7eSjoerg ///    than 0x80 are saturated to 0x80.
265206f32e7eSjoerg ///
265306f32e7eSjoerg /// \headerfile <x86intrin.h>
265406f32e7eSjoerg ///
265506f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
265606f32e7eSjoerg ///
265706f32e7eSjoerg /// \param __a
265806f32e7eSjoerg ///    A 128-bit integer vector containing the minuends.
265906f32e7eSjoerg /// \param __b
266006f32e7eSjoerg ///    A 128-bit integer vector containing the subtrahends.
266106f32e7eSjoerg /// \returns A 128-bit integer vector containing the differences of the values
266206f32e7eSjoerg ///    in the operands.
266306f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi8(__m128i __a,__m128i __b)266406f32e7eSjoerg _mm_subs_epi8(__m128i __a, __m128i __b)
266506f32e7eSjoerg {
266606f32e7eSjoerg   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
266706f32e7eSjoerg }
266806f32e7eSjoerg 
266906f32e7eSjoerg /// Subtracts corresponding 16-bit signed integer values in the input and
267006f32e7eSjoerg ///    returns the differences in the corresponding bytes in the destination.
267106f32e7eSjoerg ///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
267206f32e7eSjoerg ///    than 0x8000 are saturated to 0x8000.
267306f32e7eSjoerg ///
267406f32e7eSjoerg /// \headerfile <x86intrin.h>
267506f32e7eSjoerg ///
267606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
267706f32e7eSjoerg ///
267806f32e7eSjoerg /// \param __a
267906f32e7eSjoerg ///    A 128-bit integer vector containing the minuends.
268006f32e7eSjoerg /// \param __b
268106f32e7eSjoerg ///    A 128-bit integer vector containing the subtrahends.
268206f32e7eSjoerg /// \returns A 128-bit integer vector containing the differences of the values
268306f32e7eSjoerg ///    in the operands.
268406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi16(__m128i __a,__m128i __b)268506f32e7eSjoerg _mm_subs_epi16(__m128i __a, __m128i __b)
268606f32e7eSjoerg {
268706f32e7eSjoerg   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
268806f32e7eSjoerg }
268906f32e7eSjoerg 
269006f32e7eSjoerg /// Subtracts corresponding 8-bit unsigned integer values in the input
269106f32e7eSjoerg ///    and returns the differences in the corresponding bytes in the
269206f32e7eSjoerg ///    destination. Differences less than 0x00 are saturated to 0x00.
269306f32e7eSjoerg ///
269406f32e7eSjoerg /// \headerfile <x86intrin.h>
269506f32e7eSjoerg ///
269606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
269706f32e7eSjoerg ///
269806f32e7eSjoerg /// \param __a
269906f32e7eSjoerg ///    A 128-bit integer vector containing the minuends.
270006f32e7eSjoerg /// \param __b
270106f32e7eSjoerg ///    A 128-bit integer vector containing the subtrahends.
270206f32e7eSjoerg /// \returns A 128-bit integer vector containing the unsigned integer
270306f32e7eSjoerg ///    differences of the values in the operands.
270406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu8(__m128i __a,__m128i __b)270506f32e7eSjoerg _mm_subs_epu8(__m128i __a, __m128i __b)
270606f32e7eSjoerg {
270706f32e7eSjoerg   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
270806f32e7eSjoerg }
270906f32e7eSjoerg 
271006f32e7eSjoerg /// Subtracts corresponding 16-bit unsigned integer values in the input
271106f32e7eSjoerg ///    and returns the differences in the corresponding bytes in the
271206f32e7eSjoerg ///    destination. Differences less than 0x0000 are saturated to 0x0000.
271306f32e7eSjoerg ///
271406f32e7eSjoerg /// \headerfile <x86intrin.h>
271506f32e7eSjoerg ///
271606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
271706f32e7eSjoerg ///
271806f32e7eSjoerg /// \param __a
271906f32e7eSjoerg ///    A 128-bit integer vector containing the minuends.
272006f32e7eSjoerg /// \param __b
272106f32e7eSjoerg ///    A 128-bit integer vector containing the subtrahends.
272206f32e7eSjoerg /// \returns A 128-bit integer vector containing the unsigned integer
272306f32e7eSjoerg ///    differences of the values in the operands.
272406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu16(__m128i __a,__m128i __b)272506f32e7eSjoerg _mm_subs_epu16(__m128i __a, __m128i __b)
272606f32e7eSjoerg {
272706f32e7eSjoerg   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
272806f32e7eSjoerg }
272906f32e7eSjoerg 
273006f32e7eSjoerg /// Performs a bitwise AND of two 128-bit integer vectors.
273106f32e7eSjoerg ///
273206f32e7eSjoerg /// \headerfile <x86intrin.h>
273306f32e7eSjoerg ///
273406f32e7eSjoerg /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
273506f32e7eSjoerg ///
273606f32e7eSjoerg /// \param __a
273706f32e7eSjoerg ///    A 128-bit integer vector containing one of the source operands.
273806f32e7eSjoerg /// \param __b
273906f32e7eSjoerg ///    A 128-bit integer vector containing one of the source operands.
274006f32e7eSjoerg /// \returns A 128-bit integer vector containing the bitwise AND of the values
274106f32e7eSjoerg ///    in both operands.
274206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_and_si128(__m128i __a,__m128i __b)274306f32e7eSjoerg _mm_and_si128(__m128i __a, __m128i __b)
274406f32e7eSjoerg {
274506f32e7eSjoerg   return (__m128i)((__v2du)__a & (__v2du)__b);
274606f32e7eSjoerg }
274706f32e7eSjoerg 
274806f32e7eSjoerg /// Performs a bitwise AND of two 128-bit integer vectors, using the
274906f32e7eSjoerg ///    one's complement of the values contained in the first source operand.
275006f32e7eSjoerg ///
275106f32e7eSjoerg /// \headerfile <x86intrin.h>
275206f32e7eSjoerg ///
275306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
275406f32e7eSjoerg ///
275506f32e7eSjoerg /// \param __a
275606f32e7eSjoerg ///    A 128-bit vector containing the left source operand. The one's complement
275706f32e7eSjoerg ///    of this value is used in the bitwise AND.
275806f32e7eSjoerg /// \param __b
275906f32e7eSjoerg ///    A 128-bit vector containing the right source operand.
276006f32e7eSjoerg /// \returns A 128-bit integer vector containing the bitwise AND of the one's
276106f32e7eSjoerg ///    complement of the first operand and the values in the second operand.
276206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_andnot_si128(__m128i __a,__m128i __b)276306f32e7eSjoerg _mm_andnot_si128(__m128i __a, __m128i __b)
276406f32e7eSjoerg {
276506f32e7eSjoerg   return (__m128i)(~(__v2du)__a & (__v2du)__b);
276606f32e7eSjoerg }
276706f32e7eSjoerg /// Performs a bitwise OR of two 128-bit integer vectors.
276806f32e7eSjoerg ///
276906f32e7eSjoerg /// \headerfile <x86intrin.h>
277006f32e7eSjoerg ///
277106f32e7eSjoerg /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
277206f32e7eSjoerg ///
277306f32e7eSjoerg /// \param __a
277406f32e7eSjoerg ///    A 128-bit integer vector containing one of the source operands.
277506f32e7eSjoerg /// \param __b
277606f32e7eSjoerg ///    A 128-bit integer vector containing one of the source operands.
277706f32e7eSjoerg /// \returns A 128-bit integer vector containing the bitwise OR of the values
277806f32e7eSjoerg ///    in both operands.
277906f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_or_si128(__m128i __a,__m128i __b)278006f32e7eSjoerg _mm_or_si128(__m128i __a, __m128i __b)
278106f32e7eSjoerg {
278206f32e7eSjoerg   return (__m128i)((__v2du)__a | (__v2du)__b);
278306f32e7eSjoerg }
278406f32e7eSjoerg 
278506f32e7eSjoerg /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
278606f32e7eSjoerg ///
278706f32e7eSjoerg /// \headerfile <x86intrin.h>
278806f32e7eSjoerg ///
278906f32e7eSjoerg /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
279006f32e7eSjoerg ///
279106f32e7eSjoerg /// \param __a
279206f32e7eSjoerg ///    A 128-bit integer vector containing one of the source operands.
279306f32e7eSjoerg /// \param __b
279406f32e7eSjoerg ///    A 128-bit integer vector containing one of the source operands.
279506f32e7eSjoerg /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
279606f32e7eSjoerg ///    values in both operands.
279706f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_xor_si128(__m128i __a,__m128i __b)279806f32e7eSjoerg _mm_xor_si128(__m128i __a, __m128i __b)
279906f32e7eSjoerg {
280006f32e7eSjoerg   return (__m128i)((__v2du)__a ^ (__v2du)__b);
280106f32e7eSjoerg }
280206f32e7eSjoerg 
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result (e.g. a subscript) binds to the cast value, not to the
   builtin call inside the cast. */
#define _mm_slli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
282206f32e7eSjoerg 
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Expands to the same builtin call as _mm_slli_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_bslli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value. */
#define _mm_bslli_si128(a, imm) \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
282506f32e7eSjoerg 
282606f32e7eSjoerg /// Left-shifts each 16-bit value in the 128-bit integer vector operand
282706f32e7eSjoerg ///    by the specified number of bits. Low-order bits are cleared.
282806f32e7eSjoerg ///
282906f32e7eSjoerg /// \headerfile <x86intrin.h>
283006f32e7eSjoerg ///
283106f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
283206f32e7eSjoerg ///
283306f32e7eSjoerg /// \param __a
283406f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
283506f32e7eSjoerg /// \param __count
283606f32e7eSjoerg ///    An integer value specifying the number of bits to left-shift each value
283706f32e7eSjoerg ///    in operand \a __a.
283806f32e7eSjoerg /// \returns A 128-bit integer vector containing the left-shifted values.
283906f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi16(__m128i __a,int __count)284006f32e7eSjoerg _mm_slli_epi16(__m128i __a, int __count)
284106f32e7eSjoerg {
284206f32e7eSjoerg   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
284306f32e7eSjoerg }
284406f32e7eSjoerg 
284506f32e7eSjoerg /// Left-shifts each 16-bit value in the 128-bit integer vector operand
284606f32e7eSjoerg ///    by the specified number of bits. Low-order bits are cleared.
284706f32e7eSjoerg ///
284806f32e7eSjoerg /// \headerfile <x86intrin.h>
284906f32e7eSjoerg ///
285006f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
285106f32e7eSjoerg ///
285206f32e7eSjoerg /// \param __a
285306f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
285406f32e7eSjoerg /// \param __count
285506f32e7eSjoerg ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
285606f32e7eSjoerg ///    to left-shift each value in operand \a __a.
285706f32e7eSjoerg /// \returns A 128-bit integer vector containing the left-shifted values.
285806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi16(__m128i __a,__m128i __count)285906f32e7eSjoerg _mm_sll_epi16(__m128i __a, __m128i __count)
286006f32e7eSjoerg {
286106f32e7eSjoerg   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
286206f32e7eSjoerg }
286306f32e7eSjoerg 
286406f32e7eSjoerg /// Left-shifts each 32-bit value in the 128-bit integer vector operand
286506f32e7eSjoerg ///    by the specified number of bits. Low-order bits are cleared.
286606f32e7eSjoerg ///
286706f32e7eSjoerg /// \headerfile <x86intrin.h>
286806f32e7eSjoerg ///
286906f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
287006f32e7eSjoerg ///
287106f32e7eSjoerg /// \param __a
287206f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
287306f32e7eSjoerg /// \param __count
287406f32e7eSjoerg ///    An integer value specifying the number of bits to left-shift each value
287506f32e7eSjoerg ///    in operand \a __a.
287606f32e7eSjoerg /// \returns A 128-bit integer vector containing the left-shifted values.
287706f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi32(__m128i __a,int __count)287806f32e7eSjoerg _mm_slli_epi32(__m128i __a, int __count)
287906f32e7eSjoerg {
288006f32e7eSjoerg   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
288106f32e7eSjoerg }
288206f32e7eSjoerg 
288306f32e7eSjoerg /// Left-shifts each 32-bit value in the 128-bit integer vector operand
288406f32e7eSjoerg ///    by the specified number of bits. Low-order bits are cleared.
288506f32e7eSjoerg ///
288606f32e7eSjoerg /// \headerfile <x86intrin.h>
288706f32e7eSjoerg ///
288806f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
288906f32e7eSjoerg ///
289006f32e7eSjoerg /// \param __a
289106f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
289206f32e7eSjoerg /// \param __count
289306f32e7eSjoerg ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
289406f32e7eSjoerg ///    to left-shift each value in operand \a __a.
289506f32e7eSjoerg /// \returns A 128-bit integer vector containing the left-shifted values.
289606f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi32(__m128i __a,__m128i __count)289706f32e7eSjoerg _mm_sll_epi32(__m128i __a, __m128i __count)
289806f32e7eSjoerg {
289906f32e7eSjoerg   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
290006f32e7eSjoerg }
290106f32e7eSjoerg 
290206f32e7eSjoerg /// Left-shifts each 64-bit value in the 128-bit integer vector operand
290306f32e7eSjoerg ///    by the specified number of bits. Low-order bits are cleared.
290406f32e7eSjoerg ///
290506f32e7eSjoerg /// \headerfile <x86intrin.h>
290606f32e7eSjoerg ///
290706f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
290806f32e7eSjoerg ///
290906f32e7eSjoerg /// \param __a
291006f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
291106f32e7eSjoerg /// \param __count
291206f32e7eSjoerg ///    An integer value specifying the number of bits to left-shift each value
291306f32e7eSjoerg ///    in operand \a __a.
291406f32e7eSjoerg /// \returns A 128-bit integer vector containing the left-shifted values.
291506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi64(__m128i __a,int __count)291606f32e7eSjoerg _mm_slli_epi64(__m128i __a, int __count)
291706f32e7eSjoerg {
291806f32e7eSjoerg   return __builtin_ia32_psllqi128((__v2di)__a, __count);
291906f32e7eSjoerg }
292006f32e7eSjoerg 
292106f32e7eSjoerg /// Left-shifts each 64-bit value in the 128-bit integer vector operand
292206f32e7eSjoerg ///    by the specified number of bits. Low-order bits are cleared.
292306f32e7eSjoerg ///
292406f32e7eSjoerg /// \headerfile <x86intrin.h>
292506f32e7eSjoerg ///
292606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
292706f32e7eSjoerg ///
292806f32e7eSjoerg /// \param __a
292906f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
293006f32e7eSjoerg /// \param __count
293106f32e7eSjoerg ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
293206f32e7eSjoerg ///    to left-shift each value in operand \a __a.
293306f32e7eSjoerg /// \returns A 128-bit integer vector containing the left-shifted values.
293406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi64(__m128i __a,__m128i __count)293506f32e7eSjoerg _mm_sll_epi64(__m128i __a, __m128i __count)
293606f32e7eSjoerg {
293706f32e7eSjoerg   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
293806f32e7eSjoerg }
293906f32e7eSjoerg 
294006f32e7eSjoerg /// Right-shifts each 16-bit value in the 128-bit integer vector operand
294106f32e7eSjoerg ///    by the specified number of bits. High-order bits are filled with the sign
294206f32e7eSjoerg ///    bit of the initial value.
294306f32e7eSjoerg ///
294406f32e7eSjoerg /// \headerfile <x86intrin.h>
294506f32e7eSjoerg ///
294606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
294706f32e7eSjoerg ///
294806f32e7eSjoerg /// \param __a
294906f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
295006f32e7eSjoerg /// \param __count
295106f32e7eSjoerg ///    An integer value specifying the number of bits to right-shift each value
295206f32e7eSjoerg ///    in operand \a __a.
295306f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
295406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi16(__m128i __a,int __count)295506f32e7eSjoerg _mm_srai_epi16(__m128i __a, int __count)
295606f32e7eSjoerg {
295706f32e7eSjoerg   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
295806f32e7eSjoerg }
295906f32e7eSjoerg 
296006f32e7eSjoerg /// Right-shifts each 16-bit value in the 128-bit integer vector operand
296106f32e7eSjoerg ///    by the specified number of bits. High-order bits are filled with the sign
296206f32e7eSjoerg ///    bit of the initial value.
296306f32e7eSjoerg ///
296406f32e7eSjoerg /// \headerfile <x86intrin.h>
296506f32e7eSjoerg ///
296606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
296706f32e7eSjoerg ///
296806f32e7eSjoerg /// \param __a
296906f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
297006f32e7eSjoerg /// \param __count
297106f32e7eSjoerg ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
297206f32e7eSjoerg ///    to right-shift each value in operand \a __a.
297306f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
297406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi16(__m128i __a,__m128i __count)297506f32e7eSjoerg _mm_sra_epi16(__m128i __a, __m128i __count)
297606f32e7eSjoerg {
297706f32e7eSjoerg   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
297806f32e7eSjoerg }
297906f32e7eSjoerg 
298006f32e7eSjoerg /// Right-shifts each 32-bit value in the 128-bit integer vector operand
298106f32e7eSjoerg ///    by the specified number of bits. High-order bits are filled with the sign
298206f32e7eSjoerg ///    bit of the initial value.
298306f32e7eSjoerg ///
298406f32e7eSjoerg /// \headerfile <x86intrin.h>
298506f32e7eSjoerg ///
298606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
298706f32e7eSjoerg ///
298806f32e7eSjoerg /// \param __a
298906f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
299006f32e7eSjoerg /// \param __count
299106f32e7eSjoerg ///    An integer value specifying the number of bits to right-shift each value
299206f32e7eSjoerg ///    in operand \a __a.
299306f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
299406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi32(__m128i __a,int __count)299506f32e7eSjoerg _mm_srai_epi32(__m128i __a, int __count)
299606f32e7eSjoerg {
299706f32e7eSjoerg   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
299806f32e7eSjoerg }
299906f32e7eSjoerg 
300006f32e7eSjoerg /// Right-shifts each 32-bit value in the 128-bit integer vector operand
300106f32e7eSjoerg ///    by the specified number of bits. High-order bits are filled with the sign
300206f32e7eSjoerg ///    bit of the initial value.
300306f32e7eSjoerg ///
300406f32e7eSjoerg /// \headerfile <x86intrin.h>
300506f32e7eSjoerg ///
300606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
300706f32e7eSjoerg ///
300806f32e7eSjoerg /// \param __a
300906f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
301006f32e7eSjoerg /// \param __count
301106f32e7eSjoerg ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
301206f32e7eSjoerg ///    to right-shift each value in operand \a __a.
301306f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
301406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi32(__m128i __a,__m128i __count)301506f32e7eSjoerg _mm_sra_epi32(__m128i __a, __m128i __count)
301606f32e7eSjoerg {
301706f32e7eSjoerg   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
301806f32e7eSjoerg }
301906f32e7eSjoerg 
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result (e.g. a subscript) binds to the cast value, not to the
   builtin call inside the cast. */
#define _mm_srli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
303906f32e7eSjoerg 
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Expands to the same builtin call as _mm_srli_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_bsrli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
/* The whole expansion is parenthesized so that a postfix operator applied to
   the macro result binds to the cast value. */
#define _mm_bsrli_si128(a, imm) \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
                                                (int)(imm)))
304206f32e7eSjoerg 
304306f32e7eSjoerg /// Right-shifts each of 16-bit values in the 128-bit integer vector
304406f32e7eSjoerg ///    operand by the specified number of bits. High-order bits are cleared.
304506f32e7eSjoerg ///
304606f32e7eSjoerg /// \headerfile <x86intrin.h>
304706f32e7eSjoerg ///
304806f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
304906f32e7eSjoerg ///
305006f32e7eSjoerg /// \param __a
305106f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
305206f32e7eSjoerg /// \param __count
305306f32e7eSjoerg ///    An integer value specifying the number of bits to right-shift each value
305406f32e7eSjoerg ///    in operand \a __a.
305506f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
305606f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi16(__m128i __a,int __count)305706f32e7eSjoerg _mm_srli_epi16(__m128i __a, int __count)
305806f32e7eSjoerg {
305906f32e7eSjoerg   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
306006f32e7eSjoerg }
306106f32e7eSjoerg 
306206f32e7eSjoerg /// Right-shifts each of 16-bit values in the 128-bit integer vector
306306f32e7eSjoerg ///    operand by the specified number of bits. High-order bits are cleared.
306406f32e7eSjoerg ///
306506f32e7eSjoerg /// \headerfile <x86intrin.h>
306606f32e7eSjoerg ///
306706f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
306806f32e7eSjoerg ///
306906f32e7eSjoerg /// \param __a
307006f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
307106f32e7eSjoerg /// \param __count
307206f32e7eSjoerg ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
307306f32e7eSjoerg ///    to right-shift each value in operand \a __a.
307406f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
307506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi16(__m128i __a,__m128i __count)307606f32e7eSjoerg _mm_srl_epi16(__m128i __a, __m128i __count)
307706f32e7eSjoerg {
307806f32e7eSjoerg   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
307906f32e7eSjoerg }
308006f32e7eSjoerg 
308106f32e7eSjoerg /// Right-shifts each of 32-bit values in the 128-bit integer vector
308206f32e7eSjoerg ///    operand by the specified number of bits. High-order bits are cleared.
308306f32e7eSjoerg ///
308406f32e7eSjoerg /// \headerfile <x86intrin.h>
308506f32e7eSjoerg ///
308606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
308706f32e7eSjoerg ///
308806f32e7eSjoerg /// \param __a
308906f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
309006f32e7eSjoerg /// \param __count
309106f32e7eSjoerg ///    An integer value specifying the number of bits to right-shift each value
309206f32e7eSjoerg ///    in operand \a __a.
309306f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
309406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi32(__m128i __a,int __count)309506f32e7eSjoerg _mm_srli_epi32(__m128i __a, int __count)
309606f32e7eSjoerg {
309706f32e7eSjoerg   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
309806f32e7eSjoerg }
309906f32e7eSjoerg 
310006f32e7eSjoerg /// Right-shifts each of 32-bit values in the 128-bit integer vector
310106f32e7eSjoerg ///    operand by the specified number of bits. High-order bits are cleared.
310206f32e7eSjoerg ///
310306f32e7eSjoerg /// \headerfile <x86intrin.h>
310406f32e7eSjoerg ///
310506f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
310606f32e7eSjoerg ///
310706f32e7eSjoerg /// \param __a
310806f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
310906f32e7eSjoerg /// \param __count
311006f32e7eSjoerg ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
311106f32e7eSjoerg ///    to right-shift each value in operand \a __a.
311206f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
311306f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi32(__m128i __a,__m128i __count)311406f32e7eSjoerg _mm_srl_epi32(__m128i __a, __m128i __count)
311506f32e7eSjoerg {
311606f32e7eSjoerg   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
311706f32e7eSjoerg }
311806f32e7eSjoerg 
311906f32e7eSjoerg /// Right-shifts each of 64-bit values in the 128-bit integer vector
312006f32e7eSjoerg ///    operand by the specified number of bits. High-order bits are cleared.
312106f32e7eSjoerg ///
312206f32e7eSjoerg /// \headerfile <x86intrin.h>
312306f32e7eSjoerg ///
312406f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
312506f32e7eSjoerg ///
312606f32e7eSjoerg /// \param __a
312706f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
312806f32e7eSjoerg /// \param __count
312906f32e7eSjoerg ///    An integer value specifying the number of bits to right-shift each value
313006f32e7eSjoerg ///    in operand \a __a.
313106f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
313206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi64(__m128i __a,int __count)313306f32e7eSjoerg _mm_srli_epi64(__m128i __a, int __count)
313406f32e7eSjoerg {
313506f32e7eSjoerg   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
313606f32e7eSjoerg }
313706f32e7eSjoerg 
313806f32e7eSjoerg /// Right-shifts each of 64-bit values in the 128-bit integer vector
313906f32e7eSjoerg ///    operand by the specified number of bits. High-order bits are cleared.
314006f32e7eSjoerg ///
314106f32e7eSjoerg /// \headerfile <x86intrin.h>
314206f32e7eSjoerg ///
314306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
314406f32e7eSjoerg ///
314506f32e7eSjoerg /// \param __a
314606f32e7eSjoerg ///    A 128-bit integer vector containing the source operand.
314706f32e7eSjoerg /// \param __count
314806f32e7eSjoerg ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
314906f32e7eSjoerg ///    to right-shift each value in operand \a __a.
315006f32e7eSjoerg /// \returns A 128-bit integer vector containing the right-shifted values.
315106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi64(__m128i __a,__m128i __count)315206f32e7eSjoerg _mm_srl_epi64(__m128i __a, __m128i __count)
315306f32e7eSjoerg {
315406f32e7eSjoerg   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
315506f32e7eSjoerg }
315606f32e7eSjoerg 
315706f32e7eSjoerg /// Compares each of the corresponding 8-bit values of the 128-bit
315806f32e7eSjoerg ///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
315906f32e7eSjoerg ///    for true.
316006f32e7eSjoerg ///
316106f32e7eSjoerg /// \headerfile <x86intrin.h>
316206f32e7eSjoerg ///
316306f32e7eSjoerg /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
316406f32e7eSjoerg ///
316506f32e7eSjoerg /// \param __a
316606f32e7eSjoerg ///    A 128-bit integer vector.
316706f32e7eSjoerg /// \param __b
316806f32e7eSjoerg ///    A 128-bit integer vector.
316906f32e7eSjoerg /// \returns A 128-bit integer vector containing the comparison results.
317006f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi8(__m128i __a,__m128i __b)317106f32e7eSjoerg _mm_cmpeq_epi8(__m128i __a, __m128i __b)
317206f32e7eSjoerg {
317306f32e7eSjoerg   return (__m128i)((__v16qi)__a == (__v16qi)__b);
317406f32e7eSjoerg }
317506f32e7eSjoerg 
317606f32e7eSjoerg /// Compares each of the corresponding 16-bit values of the 128-bit
317706f32e7eSjoerg ///    integer vectors for equality. Each comparison yields 0x0 for false,
317806f32e7eSjoerg ///    0xFFFF for true.
317906f32e7eSjoerg ///
318006f32e7eSjoerg /// \headerfile <x86intrin.h>
318106f32e7eSjoerg ///
318206f32e7eSjoerg /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
318306f32e7eSjoerg ///
318406f32e7eSjoerg /// \param __a
318506f32e7eSjoerg ///    A 128-bit integer vector.
318606f32e7eSjoerg /// \param __b
318706f32e7eSjoerg ///    A 128-bit integer vector.
318806f32e7eSjoerg /// \returns A 128-bit integer vector containing the comparison results.
318906f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi16(__m128i __a,__m128i __b)319006f32e7eSjoerg _mm_cmpeq_epi16(__m128i __a, __m128i __b)
319106f32e7eSjoerg {
319206f32e7eSjoerg   return (__m128i)((__v8hi)__a == (__v8hi)__b);
319306f32e7eSjoerg }
319406f32e7eSjoerg 
319506f32e7eSjoerg /// Compares each of the corresponding 32-bit values of the 128-bit
319606f32e7eSjoerg ///    integer vectors for equality. Each comparison yields 0x0 for false,
319706f32e7eSjoerg ///    0xFFFFFFFF for true.
319806f32e7eSjoerg ///
319906f32e7eSjoerg /// \headerfile <x86intrin.h>
320006f32e7eSjoerg ///
320106f32e7eSjoerg /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
320206f32e7eSjoerg ///
320306f32e7eSjoerg /// \param __a
320406f32e7eSjoerg ///    A 128-bit integer vector.
320506f32e7eSjoerg /// \param __b
320606f32e7eSjoerg ///    A 128-bit integer vector.
320706f32e7eSjoerg /// \returns A 128-bit integer vector containing the comparison results.
320806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi32(__m128i __a,__m128i __b)320906f32e7eSjoerg _mm_cmpeq_epi32(__m128i __a, __m128i __b)
321006f32e7eSjoerg {
321106f32e7eSjoerg   return (__m128i)((__v4si)__a == (__v4si)__b);
321206f32e7eSjoerg }
321306f32e7eSjoerg 
321406f32e7eSjoerg /// Compares each of the corresponding signed 8-bit values of the 128-bit
321506f32e7eSjoerg ///    integer vectors to determine if the values in the first operand are
321606f32e7eSjoerg ///    greater than those in the second operand. Each comparison yields 0x0 for
321706f32e7eSjoerg ///    false, 0xFF for true.
321806f32e7eSjoerg ///
321906f32e7eSjoerg /// \headerfile <x86intrin.h>
322006f32e7eSjoerg ///
322106f32e7eSjoerg /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
322206f32e7eSjoerg ///
322306f32e7eSjoerg /// \param __a
322406f32e7eSjoerg ///    A 128-bit integer vector.
322506f32e7eSjoerg /// \param __b
322606f32e7eSjoerg ///    A 128-bit integer vector.
322706f32e7eSjoerg /// \returns A 128-bit integer vector containing the comparison results.
322806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi8(__m128i __a,__m128i __b)322906f32e7eSjoerg _mm_cmpgt_epi8(__m128i __a, __m128i __b)
323006f32e7eSjoerg {
323106f32e7eSjoerg   /* This function always performs a signed comparison, but __v16qi is a char
323206f32e7eSjoerg      which may be signed or unsigned, so use __v16qs. */
323306f32e7eSjoerg   return (__m128i)((__v16qs)__a > (__v16qs)__b);
323406f32e7eSjoerg }
323506f32e7eSjoerg 
323606f32e7eSjoerg /// Compares each of the corresponding signed 16-bit values of the
323706f32e7eSjoerg ///    128-bit integer vectors to determine if the values in the first operand
323806f32e7eSjoerg ///    are greater than those in the second operand.
323906f32e7eSjoerg ///
324006f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFF for true.
324106f32e7eSjoerg ///
324206f32e7eSjoerg /// \headerfile <x86intrin.h>
324306f32e7eSjoerg ///
324406f32e7eSjoerg /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
324506f32e7eSjoerg ///
324606f32e7eSjoerg /// \param __a
324706f32e7eSjoerg ///    A 128-bit integer vector.
324806f32e7eSjoerg /// \param __b
324906f32e7eSjoerg ///    A 128-bit integer vector.
325006f32e7eSjoerg /// \returns A 128-bit integer vector containing the comparison results.
325106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi16(__m128i __a,__m128i __b)325206f32e7eSjoerg _mm_cmpgt_epi16(__m128i __a, __m128i __b)
325306f32e7eSjoerg {
325406f32e7eSjoerg   return (__m128i)((__v8hi)__a > (__v8hi)__b);
325506f32e7eSjoerg }
325606f32e7eSjoerg 
325706f32e7eSjoerg /// Compares each of the corresponding signed 32-bit values of the
325806f32e7eSjoerg ///    128-bit integer vectors to determine if the values in the first operand
325906f32e7eSjoerg ///    are greater than those in the second operand.
326006f32e7eSjoerg ///
326106f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
326206f32e7eSjoerg ///
326306f32e7eSjoerg /// \headerfile <x86intrin.h>
326406f32e7eSjoerg ///
326506f32e7eSjoerg /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
326606f32e7eSjoerg ///
326706f32e7eSjoerg /// \param __a
326806f32e7eSjoerg ///    A 128-bit integer vector.
326906f32e7eSjoerg /// \param __b
327006f32e7eSjoerg ///    A 128-bit integer vector.
327106f32e7eSjoerg /// \returns A 128-bit integer vector containing the comparison results.
327206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi32(__m128i __a,__m128i __b)327306f32e7eSjoerg _mm_cmpgt_epi32(__m128i __a, __m128i __b)
327406f32e7eSjoerg {
327506f32e7eSjoerg   return (__m128i)((__v4si)__a > (__v4si)__b);
327606f32e7eSjoerg }
327706f32e7eSjoerg 
327806f32e7eSjoerg /// Compares each of the corresponding signed 8-bit values of the 128-bit
327906f32e7eSjoerg ///    integer vectors to determine if the values in the first operand are less
328006f32e7eSjoerg ///    than those in the second operand.
328106f32e7eSjoerg ///
328206f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFF for true.
328306f32e7eSjoerg ///
328406f32e7eSjoerg /// \headerfile <x86intrin.h>
328506f32e7eSjoerg ///
328606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
328706f32e7eSjoerg ///
328806f32e7eSjoerg /// \param __a
328906f32e7eSjoerg ///    A 128-bit integer vector.
329006f32e7eSjoerg /// \param __b
329106f32e7eSjoerg ///    A 128-bit integer vector.
329206f32e7eSjoerg /// \returns A 128-bit integer vector containing the comparison results.
329306f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi8(__m128i __a,__m128i __b)329406f32e7eSjoerg _mm_cmplt_epi8(__m128i __a, __m128i __b)
329506f32e7eSjoerg {
329606f32e7eSjoerg   return _mm_cmpgt_epi8(__b, __a);
329706f32e7eSjoerg }
329806f32e7eSjoerg 
329906f32e7eSjoerg /// Compares each of the corresponding signed 16-bit values of the
330006f32e7eSjoerg ///    128-bit integer vectors to determine if the values in the first operand
330106f32e7eSjoerg ///    are less than those in the second operand.
330206f32e7eSjoerg ///
330306f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFF for true.
330406f32e7eSjoerg ///
330506f32e7eSjoerg /// \headerfile <x86intrin.h>
330606f32e7eSjoerg ///
330706f32e7eSjoerg /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
330806f32e7eSjoerg ///
330906f32e7eSjoerg /// \param __a
331006f32e7eSjoerg ///    A 128-bit integer vector.
331106f32e7eSjoerg /// \param __b
331206f32e7eSjoerg ///    A 128-bit integer vector.
331306f32e7eSjoerg /// \returns A 128-bit integer vector containing the comparison results.
331406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi16(__m128i __a,__m128i __b)331506f32e7eSjoerg _mm_cmplt_epi16(__m128i __a, __m128i __b)
331606f32e7eSjoerg {
331706f32e7eSjoerg   return _mm_cmpgt_epi16(__b, __a);
331806f32e7eSjoerg }
331906f32e7eSjoerg 
332006f32e7eSjoerg /// Compares each of the corresponding signed 32-bit values of the
332106f32e7eSjoerg ///    128-bit integer vectors to determine if the values in the first operand
332206f32e7eSjoerg ///    are less than those in the second operand.
332306f32e7eSjoerg ///
332406f32e7eSjoerg ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
332506f32e7eSjoerg ///
332606f32e7eSjoerg /// \headerfile <x86intrin.h>
332706f32e7eSjoerg ///
332806f32e7eSjoerg /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
332906f32e7eSjoerg ///
333006f32e7eSjoerg /// \param __a
333106f32e7eSjoerg ///    A 128-bit integer vector.
333206f32e7eSjoerg /// \param __b
333306f32e7eSjoerg ///    A 128-bit integer vector.
333406f32e7eSjoerg /// \returns A 128-bit integer vector containing the comparison results.
333506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi32(__m128i __a,__m128i __b)333606f32e7eSjoerg _mm_cmplt_epi32(__m128i __a, __m128i __b)
333706f32e7eSjoerg {
333806f32e7eSjoerg   return _mm_cmpgt_epi32(__b, __a);
333906f32e7eSjoerg }
334006f32e7eSjoerg 
334106f32e7eSjoerg #ifdef __x86_64__
334206f32e7eSjoerg /// Converts a 64-bit signed integer value from the second operand into a
334306f32e7eSjoerg ///    double-precision value and returns it in the lower element of a [2 x
334406f32e7eSjoerg ///    double] vector; the upper element of the returned vector is copied from
334506f32e7eSjoerg ///    the upper element of the first operand.
334606f32e7eSjoerg ///
334706f32e7eSjoerg /// \headerfile <x86intrin.h>
334806f32e7eSjoerg ///
334906f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
335006f32e7eSjoerg ///
335106f32e7eSjoerg /// \param __a
335206f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
335306f32e7eSjoerg ///    copied to the upper 64 bits of the destination.
335406f32e7eSjoerg /// \param __b
335506f32e7eSjoerg ///    A 64-bit signed integer operand containing the value to be converted.
335606f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
335706f32e7eSjoerg ///    converted value of the second operand. The upper 64 bits are copied from
335806f32e7eSjoerg ///    the upper 64 bits of the first operand.
335906f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi64_sd(__m128d __a,long long __b)336006f32e7eSjoerg _mm_cvtsi64_sd(__m128d __a, long long __b)
336106f32e7eSjoerg {
336206f32e7eSjoerg   __a[0] = __b;
336306f32e7eSjoerg   return __a;
336406f32e7eSjoerg }
336506f32e7eSjoerg 
336606f32e7eSjoerg /// Converts the first (lower) element of a vector of [2 x double] into a
336706f32e7eSjoerg ///    64-bit signed integer value, according to the current rounding mode.
336806f32e7eSjoerg ///
336906f32e7eSjoerg /// \headerfile <x86intrin.h>
337006f32e7eSjoerg ///
337106f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
337206f32e7eSjoerg ///
337306f32e7eSjoerg /// \param __a
337406f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
337506f32e7eSjoerg ///    conversion.
337606f32e7eSjoerg /// \returns A 64-bit signed integer containing the converted value.
337706f32e7eSjoerg static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsd_si64(__m128d __a)337806f32e7eSjoerg _mm_cvtsd_si64(__m128d __a)
337906f32e7eSjoerg {
338006f32e7eSjoerg   return __builtin_ia32_cvtsd2si64((__v2df)__a);
338106f32e7eSjoerg }
338206f32e7eSjoerg 
338306f32e7eSjoerg /// Converts the first (lower) element of a vector of [2 x double] into a
338406f32e7eSjoerg ///    64-bit signed integer value, truncating the result when it is inexact.
338506f32e7eSjoerg ///
338606f32e7eSjoerg /// \headerfile <x86intrin.h>
338706f32e7eSjoerg ///
338806f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
338906f32e7eSjoerg ///   instruction.
339006f32e7eSjoerg ///
339106f32e7eSjoerg /// \param __a
339206f32e7eSjoerg ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
339306f32e7eSjoerg ///    conversion.
339406f32e7eSjoerg /// \returns A 64-bit signed integer containing the converted value.
339506f32e7eSjoerg static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttsd_si64(__m128d __a)339606f32e7eSjoerg _mm_cvttsd_si64(__m128d __a)
339706f32e7eSjoerg {
339806f32e7eSjoerg   return __builtin_ia32_cvttsd2si64((__v2df)__a);
339906f32e7eSjoerg }
340006f32e7eSjoerg #endif
340106f32e7eSjoerg 
340206f32e7eSjoerg /// Converts a vector of [4 x i32] into a vector of [4 x float].
340306f32e7eSjoerg ///
340406f32e7eSjoerg /// \headerfile <x86intrin.h>
340506f32e7eSjoerg ///
340606f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
340706f32e7eSjoerg ///
340806f32e7eSjoerg /// \param __a
340906f32e7eSjoerg ///    A 128-bit integer vector.
341006f32e7eSjoerg /// \returns A 128-bit vector of [4 x float] containing the converted values.
341106f32e7eSjoerg static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepi32_ps(__m128i __a)341206f32e7eSjoerg _mm_cvtepi32_ps(__m128i __a)
341306f32e7eSjoerg {
341406f32e7eSjoerg   return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
341506f32e7eSjoerg }
341606f32e7eSjoerg 
341706f32e7eSjoerg /// Converts a vector of [4 x float] into a vector of [4 x i32].
341806f32e7eSjoerg ///
341906f32e7eSjoerg /// \headerfile <x86intrin.h>
342006f32e7eSjoerg ///
342106f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
342206f32e7eSjoerg ///
342306f32e7eSjoerg /// \param __a
342406f32e7eSjoerg ///    A 128-bit vector of [4 x float].
342506f32e7eSjoerg /// \returns A 128-bit integer vector of [4 x i32] containing the converted
342606f32e7eSjoerg ///    values.
342706f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtps_epi32(__m128 __a)342806f32e7eSjoerg _mm_cvtps_epi32(__m128 __a)
342906f32e7eSjoerg {
343006f32e7eSjoerg   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
343106f32e7eSjoerg }
343206f32e7eSjoerg 
343306f32e7eSjoerg /// Converts a vector of [4 x float] into a vector of [4 x i32],
343406f32e7eSjoerg ///    truncating the result when it is inexact.
343506f32e7eSjoerg ///
343606f32e7eSjoerg /// \headerfile <x86intrin.h>
343706f32e7eSjoerg ///
343806f32e7eSjoerg /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
343906f32e7eSjoerg ///   instruction.
344006f32e7eSjoerg ///
344106f32e7eSjoerg /// \param __a
344206f32e7eSjoerg ///    A 128-bit vector of [4 x float].
344306f32e7eSjoerg /// \returns A 128-bit vector of [4 x i32] containing the converted values.
344406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttps_epi32(__m128 __a)344506f32e7eSjoerg _mm_cvttps_epi32(__m128 __a)
344606f32e7eSjoerg {
344706f32e7eSjoerg   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
344806f32e7eSjoerg }
344906f32e7eSjoerg 
345006f32e7eSjoerg /// Returns a vector of [4 x i32] where the lowest element is the input
345106f32e7eSjoerg ///    operand and the remaining elements are zero.
345206f32e7eSjoerg ///
345306f32e7eSjoerg /// \headerfile <x86intrin.h>
345406f32e7eSjoerg ///
345506f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
345606f32e7eSjoerg ///
345706f32e7eSjoerg /// \param __a
345806f32e7eSjoerg ///    A 32-bit signed integer operand.
345906f32e7eSjoerg /// \returns A 128-bit vector of [4 x i32].
346006f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi32_si128(int __a)346106f32e7eSjoerg _mm_cvtsi32_si128(int __a)
346206f32e7eSjoerg {
346306f32e7eSjoerg   return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
346406f32e7eSjoerg }
346506f32e7eSjoerg 
346606f32e7eSjoerg #ifdef __x86_64__
346706f32e7eSjoerg /// Returns a vector of [2 x i64] where the lower element is the input
346806f32e7eSjoerg ///    operand and the upper element is zero.
346906f32e7eSjoerg ///
347006f32e7eSjoerg /// \headerfile <x86intrin.h>
347106f32e7eSjoerg ///
347206f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
347306f32e7eSjoerg ///
347406f32e7eSjoerg /// \param __a
347506f32e7eSjoerg ///    A 64-bit signed integer operand containing the value to be converted.
347606f32e7eSjoerg /// \returns A 128-bit vector of [2 x i64] containing the converted value.
347706f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a)347806f32e7eSjoerg _mm_cvtsi64_si128(long long __a)
347906f32e7eSjoerg {
348006f32e7eSjoerg   return __extension__ (__m128i)(__v2di){ __a, 0 };
348106f32e7eSjoerg }
348206f32e7eSjoerg #endif
348306f32e7eSjoerg 
348406f32e7eSjoerg /// Moves the least significant 32 bits of a vector of [4 x i32] to a
348506f32e7eSjoerg ///    32-bit signed integer value.
348606f32e7eSjoerg ///
348706f32e7eSjoerg /// \headerfile <x86intrin.h>
348806f32e7eSjoerg ///
348906f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
349006f32e7eSjoerg ///
349106f32e7eSjoerg /// \param __a
349206f32e7eSjoerg ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
349306f32e7eSjoerg ///    destination.
349406f32e7eSjoerg /// \returns A 32-bit signed integer containing the moved value.
349506f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsi128_si32(__m128i __a)349606f32e7eSjoerg _mm_cvtsi128_si32(__m128i __a)
349706f32e7eSjoerg {
349806f32e7eSjoerg   __v4si __b = (__v4si)__a;
349906f32e7eSjoerg   return __b[0];
350006f32e7eSjoerg }
350106f32e7eSjoerg 
350206f32e7eSjoerg #ifdef __x86_64__
350306f32e7eSjoerg /// Moves the least significant 64 bits of a vector of [2 x i64] to a
350406f32e7eSjoerg ///    64-bit signed integer value.
350506f32e7eSjoerg ///
350606f32e7eSjoerg /// \headerfile <x86intrin.h>
350706f32e7eSjoerg ///
350806f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
350906f32e7eSjoerg ///
351006f32e7eSjoerg /// \param __a
351106f32e7eSjoerg ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
351206f32e7eSjoerg ///    destination.
351306f32e7eSjoerg /// \returns A 64-bit signed integer containing the moved value.
351406f32e7eSjoerg static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsi128_si64(__m128i __a)351506f32e7eSjoerg _mm_cvtsi128_si64(__m128i __a)
351606f32e7eSjoerg {
351706f32e7eSjoerg   return __a[0];
351806f32e7eSjoerg }
351906f32e7eSjoerg #endif
352006f32e7eSjoerg 
352106f32e7eSjoerg /// Moves packed integer values from an aligned 128-bit memory location
352206f32e7eSjoerg ///    to elements in a 128-bit integer vector.
352306f32e7eSjoerg ///
352406f32e7eSjoerg /// \headerfile <x86intrin.h>
352506f32e7eSjoerg ///
352606f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
352706f32e7eSjoerg ///
352806f32e7eSjoerg /// \param __p
352906f32e7eSjoerg ///    An aligned pointer to a memory location containing integer values.
353006f32e7eSjoerg /// \returns A 128-bit integer vector containing the moved values.
353106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_load_si128(__m128i const * __p)353206f32e7eSjoerg _mm_load_si128(__m128i const *__p)
353306f32e7eSjoerg {
353406f32e7eSjoerg   return *__p;
353506f32e7eSjoerg }
353606f32e7eSjoerg 
353706f32e7eSjoerg /// Moves packed integer values from an unaligned 128-bit memory location
353806f32e7eSjoerg ///    to elements in a 128-bit integer vector.
353906f32e7eSjoerg ///
354006f32e7eSjoerg /// \headerfile <x86intrin.h>
354106f32e7eSjoerg ///
354206f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
354306f32e7eSjoerg ///
354406f32e7eSjoerg /// \param __p
354506f32e7eSjoerg ///    A pointer to a memory location containing integer values.
354606f32e7eSjoerg /// \returns A 128-bit integer vector containing the moved values.
354706f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si128(__m128i_u const * __p)354806f32e7eSjoerg _mm_loadu_si128(__m128i_u const *__p)
354906f32e7eSjoerg {
355006f32e7eSjoerg   struct __loadu_si128 {
355106f32e7eSjoerg     __m128i_u __v;
355206f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
3553*13fbcb42Sjoerg   return ((const struct __loadu_si128*)__p)->__v;
355406f32e7eSjoerg }
355506f32e7eSjoerg 
355606f32e7eSjoerg /// Returns a vector of [2 x i64] where the lower element is taken from
355706f32e7eSjoerg ///    the lower element of the operand, and the upper element is zero.
355806f32e7eSjoerg ///
355906f32e7eSjoerg /// \headerfile <x86intrin.h>
356006f32e7eSjoerg ///
356106f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
356206f32e7eSjoerg ///
356306f32e7eSjoerg /// \param __p
356406f32e7eSjoerg ///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
356506f32e7eSjoerg ///    the destination.
356606f32e7eSjoerg /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
356706f32e7eSjoerg ///    moved value. The higher order bits are cleared.
356806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadl_epi64(__m128i_u const * __p)356906f32e7eSjoerg _mm_loadl_epi64(__m128i_u const *__p)
357006f32e7eSjoerg {
357106f32e7eSjoerg   struct __mm_loadl_epi64_struct {
357206f32e7eSjoerg     long long __u;
357306f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
3574*13fbcb42Sjoerg   return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0};
357506f32e7eSjoerg }
357606f32e7eSjoerg 
357706f32e7eSjoerg /// Generates a 128-bit vector of [4 x i32] with unspecified content.
357806f32e7eSjoerg ///    This could be used as an argument to another intrinsic function where the
357906f32e7eSjoerg ///    argument is required but the value is not actually used.
358006f32e7eSjoerg ///
358106f32e7eSjoerg /// \headerfile <x86intrin.h>
358206f32e7eSjoerg ///
358306f32e7eSjoerg /// This intrinsic has no corresponding instruction.
358406f32e7eSjoerg ///
358506f32e7eSjoerg /// \returns A 128-bit vector of [4 x i32] with unspecified content.
358606f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_undefined_si128(void)358706f32e7eSjoerg _mm_undefined_si128(void)
358806f32e7eSjoerg {
358906f32e7eSjoerg   return (__m128i)__builtin_ia32_undef128();
359006f32e7eSjoerg }
359106f32e7eSjoerg 
359206f32e7eSjoerg /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
359306f32e7eSjoerg ///    the specified 64-bit integer values.
359406f32e7eSjoerg ///
359506f32e7eSjoerg /// \headerfile <x86intrin.h>
359606f32e7eSjoerg ///
359706f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
359806f32e7eSjoerg ///    instruction.
359906f32e7eSjoerg ///
360006f32e7eSjoerg /// \param __q1
360106f32e7eSjoerg ///    A 64-bit integer value used to initialize the upper 64 bits of the
360206f32e7eSjoerg ///    destination vector of [2 x i64].
360306f32e7eSjoerg /// \param __q0
360406f32e7eSjoerg ///    A 64-bit integer value used to initialize the lower 64 bits of the
360506f32e7eSjoerg ///    destination vector of [2 x i64].
360606f32e7eSjoerg /// \returns An initialized 128-bit vector of [2 x i64] containing the values
360706f32e7eSjoerg ///    provided in the operands.
360806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64x(long long __q1,long long __q0)360906f32e7eSjoerg _mm_set_epi64x(long long __q1, long long __q0)
361006f32e7eSjoerg {
361106f32e7eSjoerg   return __extension__ (__m128i)(__v2di){ __q0, __q1 };
361206f32e7eSjoerg }
361306f32e7eSjoerg 
361406f32e7eSjoerg /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
361506f32e7eSjoerg ///    the specified 64-bit integer values.
361606f32e7eSjoerg ///
361706f32e7eSjoerg /// \headerfile <x86intrin.h>
361806f32e7eSjoerg ///
361906f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
362006f32e7eSjoerg ///    instruction.
362106f32e7eSjoerg ///
362206f32e7eSjoerg /// \param __q1
362306f32e7eSjoerg ///    A 64-bit integer value used to initialize the upper 64 bits of the
362406f32e7eSjoerg ///    destination vector of [2 x i64].
362506f32e7eSjoerg /// \param __q0
362606f32e7eSjoerg ///    A 64-bit integer value used to initialize the lower 64 bits of the
362706f32e7eSjoerg ///    destination vector of [2 x i64].
362806f32e7eSjoerg /// \returns An initialized 128-bit vector of [2 x i64] containing the values
362906f32e7eSjoerg ///    provided in the operands.
363006f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64(__m64 __q1,__m64 __q0)363106f32e7eSjoerg _mm_set_epi64(__m64 __q1, __m64 __q0)
363206f32e7eSjoerg {
363306f32e7eSjoerg   return _mm_set_epi64x((long long)__q1, (long long)__q0);
363406f32e7eSjoerg }
363506f32e7eSjoerg 
363606f32e7eSjoerg /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
363706f32e7eSjoerg ///    the specified 32-bit integer values.
363806f32e7eSjoerg ///
363906f32e7eSjoerg /// \headerfile <x86intrin.h>
364006f32e7eSjoerg ///
364106f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
364206f32e7eSjoerg ///    instruction.
364306f32e7eSjoerg ///
364406f32e7eSjoerg /// \param __i3
364506f32e7eSjoerg ///    A 32-bit integer value used to initialize bits [127:96] of the
364606f32e7eSjoerg ///    destination vector.
364706f32e7eSjoerg /// \param __i2
364806f32e7eSjoerg ///    A 32-bit integer value used to initialize bits [95:64] of the destination
364906f32e7eSjoerg ///    vector.
365006f32e7eSjoerg /// \param __i1
365106f32e7eSjoerg ///    A 32-bit integer value used to initialize bits [63:32] of the destination
365206f32e7eSjoerg ///    vector.
365306f32e7eSjoerg /// \param __i0
365406f32e7eSjoerg ///    A 32-bit integer value used to initialize bits [31:0] of the destination
365506f32e7eSjoerg ///    vector.
365606f32e7eSjoerg /// \returns An initialized 128-bit vector of [4 x i32] containing the values
365706f32e7eSjoerg ///    provided in the operands.
365806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi32(int __i3,int __i2,int __i1,int __i0)365906f32e7eSjoerg _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
366006f32e7eSjoerg {
366106f32e7eSjoerg   return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
366206f32e7eSjoerg }
366306f32e7eSjoerg 
366406f32e7eSjoerg /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
366506f32e7eSjoerg ///    the specified 16-bit integer values.
366606f32e7eSjoerg ///
366706f32e7eSjoerg /// \headerfile <x86intrin.h>
366806f32e7eSjoerg ///
366906f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
367006f32e7eSjoerg ///    instruction.
367106f32e7eSjoerg ///
367206f32e7eSjoerg /// \param __w7
367306f32e7eSjoerg ///    A 16-bit integer value used to initialize bits [127:112] of the
367406f32e7eSjoerg ///    destination vector.
367506f32e7eSjoerg /// \param __w6
367606f32e7eSjoerg ///    A 16-bit integer value used to initialize bits [111:96] of the
367706f32e7eSjoerg ///    destination vector.
367806f32e7eSjoerg /// \param __w5
367906f32e7eSjoerg ///    A 16-bit integer value used to initialize bits [95:80] of the destination
368006f32e7eSjoerg ///    vector.
368106f32e7eSjoerg /// \param __w4
368206f32e7eSjoerg ///    A 16-bit integer value used to initialize bits [79:64] of the destination
368306f32e7eSjoerg ///    vector.
368406f32e7eSjoerg /// \param __w3
368506f32e7eSjoerg ///    A 16-bit integer value used to initialize bits [63:48] of the destination
368606f32e7eSjoerg ///    vector.
368706f32e7eSjoerg /// \param __w2
368806f32e7eSjoerg ///    A 16-bit integer value used to initialize bits [47:32] of the destination
368906f32e7eSjoerg ///    vector.
369006f32e7eSjoerg /// \param __w1
369106f32e7eSjoerg ///    A 16-bit integer value used to initialize bits [31:16] of the destination
369206f32e7eSjoerg ///    vector.
369306f32e7eSjoerg /// \param __w0
369406f32e7eSjoerg ///    A 16-bit integer value used to initialize bits [15:0] of the destination
369506f32e7eSjoerg ///    vector.
369606f32e7eSjoerg /// \returns An initialized 128-bit vector of [8 x i16] containing the values
369706f32e7eSjoerg ///    provided in the operands.
369806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi16(short __w7,short __w6,short __w5,short __w4,short __w3,short __w2,short __w1,short __w0)369906f32e7eSjoerg _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
370006f32e7eSjoerg {
370106f32e7eSjoerg   return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
370206f32e7eSjoerg }
370306f32e7eSjoerg 
370406f32e7eSjoerg /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
370506f32e7eSjoerg ///    the specified 8-bit integer values.
370606f32e7eSjoerg ///
370706f32e7eSjoerg /// \headerfile <x86intrin.h>
370806f32e7eSjoerg ///
370906f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
371006f32e7eSjoerg ///    instruction.
371106f32e7eSjoerg ///
371206f32e7eSjoerg /// \param __b15
371306f32e7eSjoerg ///    Initializes bits [127:120] of the destination vector.
371406f32e7eSjoerg /// \param __b14
371506f32e7eSjoerg ///    Initializes bits [119:112] of the destination vector.
371606f32e7eSjoerg /// \param __b13
371706f32e7eSjoerg ///    Initializes bits [111:104] of the destination vector.
371806f32e7eSjoerg /// \param __b12
371906f32e7eSjoerg ///    Initializes bits [103:96] of the destination vector.
372006f32e7eSjoerg /// \param __b11
372106f32e7eSjoerg ///    Initializes bits [95:88] of the destination vector.
372206f32e7eSjoerg /// \param __b10
372306f32e7eSjoerg ///    Initializes bits [87:80] of the destination vector.
372406f32e7eSjoerg /// \param __b9
372506f32e7eSjoerg ///    Initializes bits [79:72] of the destination vector.
372606f32e7eSjoerg /// \param __b8
372706f32e7eSjoerg ///    Initializes bits [71:64] of the destination vector.
372806f32e7eSjoerg /// \param __b7
372906f32e7eSjoerg ///    Initializes bits [63:56] of the destination vector.
373006f32e7eSjoerg /// \param __b6
373106f32e7eSjoerg ///    Initializes bits [55:48] of the destination vector.
373206f32e7eSjoerg /// \param __b5
373306f32e7eSjoerg ///    Initializes bits [47:40] of the destination vector.
373406f32e7eSjoerg /// \param __b4
373506f32e7eSjoerg ///    Initializes bits [39:32] of the destination vector.
373606f32e7eSjoerg /// \param __b3
373706f32e7eSjoerg ///    Initializes bits [31:24] of the destination vector.
373806f32e7eSjoerg /// \param __b2
373906f32e7eSjoerg ///    Initializes bits [23:16] of the destination vector.
374006f32e7eSjoerg /// \param __b1
374106f32e7eSjoerg ///    Initializes bits [15:8] of the destination vector.
374206f32e7eSjoerg /// \param __b0
374306f32e7eSjoerg ///    Initializes bits [7:0] of the destination vector.
374406f32e7eSjoerg /// \returns An initialized 128-bit vector of [16 x i8] containing the values
374506f32e7eSjoerg ///    provided in the operands.
374606f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi8(char __b15,char __b14,char __b13,char __b12,char __b11,char __b10,char __b9,char __b8,char __b7,char __b6,char __b5,char __b4,char __b3,char __b2,char __b1,char __b0)374706f32e7eSjoerg _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
374806f32e7eSjoerg {
374906f32e7eSjoerg   return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
375006f32e7eSjoerg }
375106f32e7eSjoerg 
375206f32e7eSjoerg /// Initializes both values in a 128-bit integer vector with the
375306f32e7eSjoerg ///    specified 64-bit integer value.
375406f32e7eSjoerg ///
375506f32e7eSjoerg /// \headerfile <x86intrin.h>
375606f32e7eSjoerg ///
375706f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
375806f32e7eSjoerg ///    instruction.
375906f32e7eSjoerg ///
376006f32e7eSjoerg /// \param __q
376106f32e7eSjoerg ///    Integer value used to initialize the elements of the destination integer
376206f32e7eSjoerg ///    vector.
376306f32e7eSjoerg /// \returns An initialized 128-bit integer vector of [2 x i64] with both
376406f32e7eSjoerg ///    elements containing the value provided in the operand.
376506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64x(long long __q)376606f32e7eSjoerg _mm_set1_epi64x(long long __q)
376706f32e7eSjoerg {
376806f32e7eSjoerg   return _mm_set_epi64x(__q, __q);
376906f32e7eSjoerg }
377006f32e7eSjoerg 
377106f32e7eSjoerg /// Initializes both values in a 128-bit vector of [2 x i64] with the
377206f32e7eSjoerg ///    specified 64-bit value.
377306f32e7eSjoerg ///
377406f32e7eSjoerg /// \headerfile <x86intrin.h>
377506f32e7eSjoerg ///
377606f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
377706f32e7eSjoerg ///    instruction.
377806f32e7eSjoerg ///
377906f32e7eSjoerg /// \param __q
378006f32e7eSjoerg ///    A 64-bit value used to initialize the elements of the destination integer
378106f32e7eSjoerg ///    vector.
378206f32e7eSjoerg /// \returns An initialized 128-bit vector of [2 x i64] with all elements
378306f32e7eSjoerg ///    containing the value provided in the operand.
378406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64(__m64 __q)378506f32e7eSjoerg _mm_set1_epi64(__m64 __q)
378606f32e7eSjoerg {
378706f32e7eSjoerg   return _mm_set_epi64(__q, __q);
378806f32e7eSjoerg }
378906f32e7eSjoerg 
379006f32e7eSjoerg /// Initializes all values in a 128-bit vector of [4 x i32] with the
379106f32e7eSjoerg ///    specified 32-bit value.
379206f32e7eSjoerg ///
379306f32e7eSjoerg /// \headerfile <x86intrin.h>
379406f32e7eSjoerg ///
379506f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
379606f32e7eSjoerg ///    instruction.
379706f32e7eSjoerg ///
379806f32e7eSjoerg /// \param __i
379906f32e7eSjoerg ///    A 32-bit value used to initialize the elements of the destination integer
380006f32e7eSjoerg ///    vector.
380106f32e7eSjoerg /// \returns An initialized 128-bit vector of [4 x i32] with all elements
380206f32e7eSjoerg ///    containing the value provided in the operand.
380306f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi32(int __i)380406f32e7eSjoerg _mm_set1_epi32(int __i)
380506f32e7eSjoerg {
380606f32e7eSjoerg   return _mm_set_epi32(__i, __i, __i, __i);
380706f32e7eSjoerg }
380806f32e7eSjoerg 
380906f32e7eSjoerg /// Initializes all values in a 128-bit vector of [8 x i16] with the
381006f32e7eSjoerg ///    specified 16-bit value.
381106f32e7eSjoerg ///
381206f32e7eSjoerg /// \headerfile <x86intrin.h>
381306f32e7eSjoerg ///
381406f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
381506f32e7eSjoerg ///    instruction.
381606f32e7eSjoerg ///
381706f32e7eSjoerg /// \param __w
381806f32e7eSjoerg ///    A 16-bit value used to initialize the elements of the destination integer
381906f32e7eSjoerg ///    vector.
382006f32e7eSjoerg /// \returns An initialized 128-bit vector of [8 x i16] with all elements
382106f32e7eSjoerg ///    containing the value provided in the operand.
382206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi16(short __w)382306f32e7eSjoerg _mm_set1_epi16(short __w)
382406f32e7eSjoerg {
382506f32e7eSjoerg   return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
382606f32e7eSjoerg }
382706f32e7eSjoerg 
382806f32e7eSjoerg /// Initializes all values in a 128-bit vector of [16 x i8] with the
382906f32e7eSjoerg ///    specified 8-bit value.
383006f32e7eSjoerg ///
383106f32e7eSjoerg /// \headerfile <x86intrin.h>
383206f32e7eSjoerg ///
383306f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
383406f32e7eSjoerg ///    instruction.
383506f32e7eSjoerg ///
383606f32e7eSjoerg /// \param __b
383706f32e7eSjoerg ///    An 8-bit value used to initialize the elements of the destination integer
383806f32e7eSjoerg ///    vector.
383906f32e7eSjoerg /// \returns An initialized 128-bit vector of [16 x i8] with all elements
384006f32e7eSjoerg ///    containing the value provided in the operand.
384106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi8(char __b)384206f32e7eSjoerg _mm_set1_epi8(char __b)
384306f32e7eSjoerg {
384406f32e7eSjoerg   return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
384506f32e7eSjoerg }
384606f32e7eSjoerg 
384706f32e7eSjoerg /// Constructs a 128-bit integer vector, initialized in reverse order
384806f32e7eSjoerg ///     with the specified 64-bit integral values.
384906f32e7eSjoerg ///
385006f32e7eSjoerg /// \headerfile <x86intrin.h>
385106f32e7eSjoerg ///
385206f32e7eSjoerg /// This intrinsic does not correspond to a specific instruction.
385306f32e7eSjoerg ///
385406f32e7eSjoerg /// \param __q0
385506f32e7eSjoerg ///    A 64-bit integral value used to initialize the lower 64 bits of the
385606f32e7eSjoerg ///    result.
385706f32e7eSjoerg /// \param __q1
385806f32e7eSjoerg ///    A 64-bit integral value used to initialize the upper 64 bits of the
385906f32e7eSjoerg ///    result.
386006f32e7eSjoerg /// \returns An initialized 128-bit integer vector.
386106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 __q0,__m64 __q1)386206f32e7eSjoerg _mm_setr_epi64(__m64 __q0, __m64 __q1)
386306f32e7eSjoerg {
386406f32e7eSjoerg   return _mm_set_epi64(__q1, __q0);
386506f32e7eSjoerg }
386606f32e7eSjoerg 
386706f32e7eSjoerg /// Constructs a 128-bit integer vector, initialized in reverse order
386806f32e7eSjoerg ///     with the specified 32-bit integral values.
386906f32e7eSjoerg ///
387006f32e7eSjoerg /// \headerfile <x86intrin.h>
387106f32e7eSjoerg ///
387206f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
387306f32e7eSjoerg ///    instruction.
387406f32e7eSjoerg ///
387506f32e7eSjoerg /// \param __i0
387606f32e7eSjoerg ///    A 32-bit integral value used to initialize bits [31:0] of the result.
387706f32e7eSjoerg /// \param __i1
387806f32e7eSjoerg ///    A 32-bit integral value used to initialize bits [63:32] of the result.
387906f32e7eSjoerg /// \param __i2
388006f32e7eSjoerg ///    A 32-bit integral value used to initialize bits [95:64] of the result.
388106f32e7eSjoerg /// \param __i3
388206f32e7eSjoerg ///    A 32-bit integral value used to initialize bits [127:96] of the result.
388306f32e7eSjoerg /// \returns An initialized 128-bit integer vector.
388406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int __i0,int __i1,int __i2,int __i3)388506f32e7eSjoerg _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
388606f32e7eSjoerg {
388706f32e7eSjoerg   return _mm_set_epi32(__i3, __i2, __i1, __i0);
388806f32e7eSjoerg }
388906f32e7eSjoerg 
389006f32e7eSjoerg /// Constructs a 128-bit integer vector, initialized in reverse order
389106f32e7eSjoerg ///     with the specified 16-bit integral values.
389206f32e7eSjoerg ///
389306f32e7eSjoerg /// \headerfile <x86intrin.h>
389406f32e7eSjoerg ///
389506f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
389606f32e7eSjoerg ///    instruction.
389706f32e7eSjoerg ///
389806f32e7eSjoerg /// \param __w0
389906f32e7eSjoerg ///    A 16-bit integral value used to initialize bits [15:0] of the result.
390006f32e7eSjoerg /// \param __w1
390106f32e7eSjoerg ///    A 16-bit integral value used to initialize bits [31:16] of the result.
390206f32e7eSjoerg /// \param __w2
390306f32e7eSjoerg ///    A 16-bit integral value used to initialize bits [47:32] of the result.
390406f32e7eSjoerg /// \param __w3
390506f32e7eSjoerg ///    A 16-bit integral value used to initialize bits [63:48] of the result.
390606f32e7eSjoerg /// \param __w4
390706f32e7eSjoerg ///    A 16-bit integral value used to initialize bits [79:64] of the result.
390806f32e7eSjoerg /// \param __w5
390906f32e7eSjoerg ///    A 16-bit integral value used to initialize bits [95:80] of the result.
391006f32e7eSjoerg /// \param __w6
391106f32e7eSjoerg ///    A 16-bit integral value used to initialize bits [111:96] of the result.
391206f32e7eSjoerg /// \param __w7
391306f32e7eSjoerg ///    A 16-bit integral value used to initialize bits [127:112] of the result.
391406f32e7eSjoerg /// \returns An initialized 128-bit integer vector.
391506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short __w0,short __w1,short __w2,short __w3,short __w4,short __w5,short __w6,short __w7)391606f32e7eSjoerg _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
391706f32e7eSjoerg {
391806f32e7eSjoerg   return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
391906f32e7eSjoerg }
392006f32e7eSjoerg 
392106f32e7eSjoerg /// Constructs a 128-bit integer vector, initialized in reverse order
392206f32e7eSjoerg ///     with the specified 8-bit integral values.
392306f32e7eSjoerg ///
392406f32e7eSjoerg /// \headerfile <x86intrin.h>
392506f32e7eSjoerg ///
392606f32e7eSjoerg /// This intrinsic is a utility function and does not correspond to a specific
392706f32e7eSjoerg ///    instruction.
392806f32e7eSjoerg ///
392906f32e7eSjoerg /// \param __b0
393006f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [7:0] of the result.
393106f32e7eSjoerg /// \param __b1
393206f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [15:8] of the result.
393306f32e7eSjoerg /// \param __b2
393406f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [23:16] of the result.
393506f32e7eSjoerg /// \param __b3
393606f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [31:24] of the result.
393706f32e7eSjoerg /// \param __b4
393806f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [39:32] of the result.
393906f32e7eSjoerg /// \param __b5
394006f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [47:40] of the result.
394106f32e7eSjoerg /// \param __b6
394206f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [55:48] of the result.
394306f32e7eSjoerg /// \param __b7
394406f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [63:56] of the result.
394506f32e7eSjoerg /// \param __b8
394606f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [71:64] of the result.
394706f32e7eSjoerg /// \param __b9
394806f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [79:72] of the result.
394906f32e7eSjoerg /// \param __b10
395006f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [87:80] of the result.
395106f32e7eSjoerg /// \param __b11
395206f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [95:88] of the result.
395306f32e7eSjoerg /// \param __b12
395406f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [103:96] of the result.
395506f32e7eSjoerg /// \param __b13
395606f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [111:104] of the result.
395706f32e7eSjoerg /// \param __b14
395806f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [119:112] of the result.
395906f32e7eSjoerg /// \param __b15
396006f32e7eSjoerg ///    An 8-bit integral value used to initialize bits [127:120] of the result.
396106f32e7eSjoerg /// \returns An initialized 128-bit integer vector.
396206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char __b0,char __b1,char __b2,char __b3,char __b4,char __b5,char __b6,char __b7,char __b8,char __b9,char __b10,char __b11,char __b12,char __b13,char __b14,char __b15)396306f32e7eSjoerg _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
396406f32e7eSjoerg {
396506f32e7eSjoerg   return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
396606f32e7eSjoerg }
396706f32e7eSjoerg 
396806f32e7eSjoerg /// Creates a 128-bit integer vector initialized to zero.
396906f32e7eSjoerg ///
397006f32e7eSjoerg /// \headerfile <x86intrin.h>
397106f32e7eSjoerg ///
397206f32e7eSjoerg /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
397306f32e7eSjoerg ///
397406f32e7eSjoerg /// \returns An initialized 128-bit integer vector with all elements set to
397506f32e7eSjoerg ///    zero.
397606f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void)397706f32e7eSjoerg _mm_setzero_si128(void)
397806f32e7eSjoerg {
397906f32e7eSjoerg   return __extension__ (__m128i)(__v2di){ 0LL, 0LL };
398006f32e7eSjoerg }
398106f32e7eSjoerg 
398206f32e7eSjoerg /// Stores a 128-bit integer vector to a memory location aligned on a
398306f32e7eSjoerg ///    128-bit boundary.
398406f32e7eSjoerg ///
398506f32e7eSjoerg /// \headerfile <x86intrin.h>
398606f32e7eSjoerg ///
398706f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
398806f32e7eSjoerg ///
398906f32e7eSjoerg /// \param __p
399006f32e7eSjoerg ///    A pointer to an aligned memory location that will receive the integer
399106f32e7eSjoerg ///    values.
399206f32e7eSjoerg /// \param __b
399306f32e7eSjoerg ///    A 128-bit integer vector containing the values to be moved.
399406f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i * __p,__m128i __b)399506f32e7eSjoerg _mm_store_si128(__m128i *__p, __m128i __b)
399606f32e7eSjoerg {
399706f32e7eSjoerg   *__p = __b;
399806f32e7eSjoerg }
399906f32e7eSjoerg 
400006f32e7eSjoerg /// Stores a 128-bit integer vector to an unaligned memory location.
400106f32e7eSjoerg ///
400206f32e7eSjoerg /// \headerfile <x86intrin.h>
400306f32e7eSjoerg ///
400406f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
400506f32e7eSjoerg ///
400606f32e7eSjoerg /// \param __p
400706f32e7eSjoerg ///    A pointer to a memory location that will receive the integer values.
400806f32e7eSjoerg /// \param __b
400906f32e7eSjoerg ///    A 128-bit integer vector containing the values to be moved.
401006f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i_u * __p,__m128i __b)401106f32e7eSjoerg _mm_storeu_si128(__m128i_u *__p, __m128i __b)
401206f32e7eSjoerg {
401306f32e7eSjoerg   struct __storeu_si128 {
401406f32e7eSjoerg     __m128i_u __v;
401506f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
401606f32e7eSjoerg   ((struct __storeu_si128*)__p)->__v = __b;
401706f32e7eSjoerg }
401806f32e7eSjoerg 
401906f32e7eSjoerg /// Stores a 64-bit integer value from the low element of a 128-bit integer
402006f32e7eSjoerg ///    vector.
402106f32e7eSjoerg ///
402206f32e7eSjoerg /// \headerfile <x86intrin.h>
402306f32e7eSjoerg ///
402406f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
402506f32e7eSjoerg ///
402606f32e7eSjoerg /// \param __p
402706f32e7eSjoerg ///    A pointer to a 64-bit memory location. The address of the memory
4028*13fbcb42Sjoerg ///    location does not have to be aligned.
402906f32e7eSjoerg /// \param __b
403006f32e7eSjoerg ///    A 128-bit integer vector containing the value to be stored.
403106f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si64(void * __p,__m128i __b)403206f32e7eSjoerg _mm_storeu_si64(void *__p, __m128i __b)
403306f32e7eSjoerg {
403406f32e7eSjoerg   struct __storeu_si64 {
403506f32e7eSjoerg     long long __v;
403606f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
403706f32e7eSjoerg   ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0];
403806f32e7eSjoerg }
403906f32e7eSjoerg 
404006f32e7eSjoerg /// Stores a 32-bit integer value from the low element of a 128-bit integer
404106f32e7eSjoerg ///    vector.
404206f32e7eSjoerg ///
404306f32e7eSjoerg /// \headerfile <x86intrin.h>
404406f32e7eSjoerg ///
404506f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
404606f32e7eSjoerg ///
404706f32e7eSjoerg /// \param __p
404806f32e7eSjoerg ///    A pointer to a 32-bit memory location. The address of the memory
404906f32e7eSjoerg ///    location does not have to be aligned.
405006f32e7eSjoerg /// \param __b
405106f32e7eSjoerg ///    A 128-bit integer vector containing the value to be stored.
405206f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si32(void * __p,__m128i __b)405306f32e7eSjoerg _mm_storeu_si32(void *__p, __m128i __b)
405406f32e7eSjoerg {
405506f32e7eSjoerg   struct __storeu_si32 {
405606f32e7eSjoerg     int __v;
405706f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
405806f32e7eSjoerg   ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0];
405906f32e7eSjoerg }
406006f32e7eSjoerg 
406106f32e7eSjoerg /// Stores a 16-bit integer value from the low element of a 128-bit integer
406206f32e7eSjoerg ///    vector.
406306f32e7eSjoerg ///
406406f32e7eSjoerg /// \headerfile <x86intrin.h>
406506f32e7eSjoerg ///
406606f32e7eSjoerg /// This intrinsic does not correspond to a specific instruction.
406706f32e7eSjoerg ///
406806f32e7eSjoerg /// \param __p
406906f32e7eSjoerg ///    A pointer to a 16-bit memory location. The address of the memory
407006f32e7eSjoerg ///    location does not have to be aligned.
407106f32e7eSjoerg /// \param __b
407206f32e7eSjoerg ///    A 128-bit integer vector containing the value to be stored.
407306f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si16(void * __p,__m128i __b)407406f32e7eSjoerg _mm_storeu_si16(void *__p, __m128i __b)
407506f32e7eSjoerg {
407606f32e7eSjoerg   struct __storeu_si16 {
407706f32e7eSjoerg     short __v;
407806f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
407906f32e7eSjoerg   ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0];
408006f32e7eSjoerg }
408106f32e7eSjoerg 
408206f32e7eSjoerg /// Moves bytes selected by the mask from the first operand to the
408306f32e7eSjoerg ///    specified unaligned memory location. When a mask bit is 1, the
408406f32e7eSjoerg ///    corresponding byte is written, otherwise it is not written.
408506f32e7eSjoerg ///
408606f32e7eSjoerg ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
408706f32e7eSjoerg ///    used again soon). Exception and trap behavior for elements not selected
408806f32e7eSjoerg ///    for storage to memory are implementation dependent.
408906f32e7eSjoerg ///
409006f32e7eSjoerg /// \headerfile <x86intrin.h>
409106f32e7eSjoerg ///
409206f32e7eSjoerg /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
409306f32e7eSjoerg ///   instruction.
409406f32e7eSjoerg ///
409506f32e7eSjoerg /// \param __d
409606f32e7eSjoerg ///    A 128-bit integer vector containing the values to be moved.
409706f32e7eSjoerg /// \param __n
409806f32e7eSjoerg ///    A 128-bit integer vector containing the mask. The most significant bit of
409906f32e7eSjoerg ///    each byte represents the mask bits.
410006f32e7eSjoerg /// \param __p
410106f32e7eSjoerg ///    A pointer to an unaligned 128-bit memory location where the specified
410206f32e7eSjoerg ///    values are moved.
410306f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d,__m128i __n,char * __p)410406f32e7eSjoerg _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
410506f32e7eSjoerg {
410606f32e7eSjoerg   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
410706f32e7eSjoerg }
410806f32e7eSjoerg 
410906f32e7eSjoerg /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
411006f32e7eSjoerg ///    a memory location.
411106f32e7eSjoerg ///
411206f32e7eSjoerg /// \headerfile <x86intrin.h>
411306f32e7eSjoerg ///
411406f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
411506f32e7eSjoerg ///
411606f32e7eSjoerg /// \param __p
411706f32e7eSjoerg ///    A pointer to a 64-bit memory location that will receive the lower 64 bits
411806f32e7eSjoerg ///    of the integer vector parameter.
411906f32e7eSjoerg /// \param __a
412006f32e7eSjoerg ///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
412106f32e7eSjoerg ///    value to be stored.
412206f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i_u * __p,__m128i __a)412306f32e7eSjoerg _mm_storel_epi64(__m128i_u *__p, __m128i __a)
412406f32e7eSjoerg {
412506f32e7eSjoerg   struct __mm_storel_epi64_struct {
412606f32e7eSjoerg     long long __u;
412706f32e7eSjoerg   } __attribute__((__packed__, __may_alias__));
412806f32e7eSjoerg   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
412906f32e7eSjoerg }
413006f32e7eSjoerg 
413106f32e7eSjoerg /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
413206f32e7eSjoerg ///    aligned memory location.
413306f32e7eSjoerg ///
413406f32e7eSjoerg ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
413506f32e7eSjoerg ///    used again soon).
413606f32e7eSjoerg ///
413706f32e7eSjoerg /// \headerfile <x86intrin.h>
413806f32e7eSjoerg ///
413906f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
414006f32e7eSjoerg ///
414106f32e7eSjoerg /// \param __p
414206f32e7eSjoerg ///    A pointer to the 128-bit aligned memory location used to store the value.
414306f32e7eSjoerg /// \param __a
414406f32e7eSjoerg ///    A vector of [2 x double] containing the 64-bit values to be stored.
414506f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double * __p,__m128d __a)414606f32e7eSjoerg _mm_stream_pd(double *__p, __m128d __a)
414706f32e7eSjoerg {
414806f32e7eSjoerg   __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
414906f32e7eSjoerg }
415006f32e7eSjoerg 
415106f32e7eSjoerg /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
415206f32e7eSjoerg ///
415306f32e7eSjoerg ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
415406f32e7eSjoerg ///    used again soon).
415506f32e7eSjoerg ///
415606f32e7eSjoerg /// \headerfile <x86intrin.h>
415706f32e7eSjoerg ///
415806f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
415906f32e7eSjoerg ///
416006f32e7eSjoerg /// \param __p
416106f32e7eSjoerg ///    A pointer to the 128-bit aligned memory location used to store the value.
416206f32e7eSjoerg /// \param __a
416306f32e7eSjoerg ///    A 128-bit integer vector containing the values to be stored.
416406f32e7eSjoerg static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i * __p,__m128i __a)416506f32e7eSjoerg _mm_stream_si128(__m128i *__p, __m128i __a)
416606f32e7eSjoerg {
416706f32e7eSjoerg   __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
416806f32e7eSjoerg }
416906f32e7eSjoerg 
/// Writes a 32-bit integer value to the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location that receives the value.
/// \param __a
///    The 32-bit integer value to be stored.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si32(int *__p, int __a)
{
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
418806f32e7eSjoerg 
#if defined(__x86_64__)
/// Writes a 64-bit integer value to the specified memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location that receives the value.
/// \param __a
///    The 64-bit integer value to be stored.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si64(long long *__p, long long __a)
{
  long long *__dst = __p;
  __builtin_ia32_movnti64(__dst, __a);
}
#endif
420906f32e7eSjoerg 
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates the cache line that contains \a __p from every
///    cache in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Enforces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, so that all earlier loads complete before any later
///    load executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Enforces strong memory ordering (serialization) between the load and
///    store instructions that precede this instruction and the load and store
///    instructions that follow it, so that all earlier memory accesses
///    complete before any later memory access executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
425106f32e7eSjoerg 
425206f32e7eSjoerg /// Converts 16-bit signed integers from both 128-bit integer vector
425306f32e7eSjoerg ///    operands into 8-bit signed integers, and packs the results into the
425406f32e7eSjoerg ///    destination. Positive values greater than 0x7F are saturated to 0x7F.
425506f32e7eSjoerg ///    Negative values less than 0x80 are saturated to 0x80.
425606f32e7eSjoerg ///
425706f32e7eSjoerg /// \headerfile <x86intrin.h>
425806f32e7eSjoerg ///
425906f32e7eSjoerg /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
426006f32e7eSjoerg ///
426106f32e7eSjoerg /// \param __a
426206f32e7eSjoerg ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
426306f32e7eSjoerg ///   a signed integer and is converted to a 8-bit signed integer with
426406f32e7eSjoerg ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
426506f32e7eSjoerg ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
426606f32e7eSjoerg ///   written to the lower 64 bits of the result.
426706f32e7eSjoerg /// \param __b
426806f32e7eSjoerg ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
426906f32e7eSjoerg ///   a signed integer and is converted to a 8-bit signed integer with
427006f32e7eSjoerg ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
427106f32e7eSjoerg ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
427206f32e7eSjoerg ///   written to the higher 64 bits of the result.
427306f32e7eSjoerg /// \returns A 128-bit vector of [16 x i8] containing the converted values.
427406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a,__m128i __b)427506f32e7eSjoerg _mm_packs_epi16(__m128i __a, __m128i __b)
427606f32e7eSjoerg {
427706f32e7eSjoerg   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
427806f32e7eSjoerg }
427906f32e7eSjoerg 
428006f32e7eSjoerg /// Converts 32-bit signed integers from both 128-bit integer vector
428106f32e7eSjoerg ///    operands into 16-bit signed integers, and packs the results into the
428206f32e7eSjoerg ///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
428306f32e7eSjoerg ///    Negative values less than 0x8000 are saturated to 0x8000.
428406f32e7eSjoerg ///
428506f32e7eSjoerg /// \headerfile <x86intrin.h>
428606f32e7eSjoerg ///
428706f32e7eSjoerg /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
428806f32e7eSjoerg ///
428906f32e7eSjoerg /// \param __a
429006f32e7eSjoerg ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
429106f32e7eSjoerg ///    a signed integer and is converted to a 16-bit signed integer with
429206f32e7eSjoerg ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
429306f32e7eSjoerg ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
429406f32e7eSjoerg ///    are written to the lower 64 bits of the result.
429506f32e7eSjoerg /// \param __b
429606f32e7eSjoerg ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
429706f32e7eSjoerg ///    a signed integer and is converted to a 16-bit signed integer with
429806f32e7eSjoerg ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
429906f32e7eSjoerg ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
430006f32e7eSjoerg ///    are written to the higher 64 bits of the result.
430106f32e7eSjoerg /// \returns A 128-bit vector of [8 x i16] containing the converted values.
430206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a,__m128i __b)430306f32e7eSjoerg _mm_packs_epi32(__m128i __a, __m128i __b)
430406f32e7eSjoerg {
430506f32e7eSjoerg   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
430606f32e7eSjoerg }
430706f32e7eSjoerg 
430806f32e7eSjoerg /// Converts 16-bit signed integers from both 128-bit integer vector
430906f32e7eSjoerg ///    operands into 8-bit unsigned integers, and packs the results into the
431006f32e7eSjoerg ///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
431106f32e7eSjoerg ///    than 0x00 are saturated to 0x00.
431206f32e7eSjoerg ///
431306f32e7eSjoerg /// \headerfile <x86intrin.h>
431406f32e7eSjoerg ///
431506f32e7eSjoerg /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
431606f32e7eSjoerg ///
431706f32e7eSjoerg /// \param __a
431806f32e7eSjoerg ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
431906f32e7eSjoerg ///    a signed integer and is converted to an 8-bit unsigned integer with
432006f32e7eSjoerg ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
432106f32e7eSjoerg ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
432206f32e7eSjoerg ///    written to the lower 64 bits of the result.
432306f32e7eSjoerg /// \param __b
432406f32e7eSjoerg ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
432506f32e7eSjoerg ///    a signed integer and is converted to an 8-bit unsigned integer with
432606f32e7eSjoerg ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
432706f32e7eSjoerg ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
432806f32e7eSjoerg ///    written to the higher 64 bits of the result.
432906f32e7eSjoerg /// \returns A 128-bit vector of [16 x i8] containing the converted values.
433006f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a,__m128i __b)433106f32e7eSjoerg _mm_packus_epi16(__m128i __a, __m128i __b)
433206f32e7eSjoerg {
433306f32e7eSjoerg   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
433406f32e7eSjoerg }
433506f32e7eSjoerg 
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select the value from \a a that is
///    assigned to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
/* The whole expansion is parenthesized so the leading casts cannot be split
   from the call by a neighbouring operator (sizeof, postfix operators). */
#define _mm_extract_epi16(a, imm) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
                                                    (int)(imm)))
436106f32e7eSjoerg 
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter at a position specified by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    result at the position specified by \a imm.
/// \param imm
///    An immediate value that selects which of the eight 16-bit elements of
///    the result is replaced by the lower 16 bits of \a b.
/// \returns A 128-bit integer vector containing the constructed values.
/* The whole expansion is parenthesized so the leading cast cannot be split
   from the call by a neighbouring operator (e.g. a trailing subscript). */
#define _mm_insert_epi16(a, b, imm) \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
                                        (int)(imm)))
438506f32e7eSjoerg 
438606f32e7eSjoerg /// Copies the values of the most significant bits from each 8-bit
438706f32e7eSjoerg ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
438806f32e7eSjoerg ///    value, zero-extends the value, and writes it to the destination.
438906f32e7eSjoerg ///
439006f32e7eSjoerg /// \headerfile <x86intrin.h>
439106f32e7eSjoerg ///
439206f32e7eSjoerg /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
439306f32e7eSjoerg ///
439406f32e7eSjoerg /// \param __a
439506f32e7eSjoerg ///    A 128-bit integer vector containing the values with bits to be extracted.
439606f32e7eSjoerg /// \returns The most significant bits from each 8-bit element in \a __a,
439706f32e7eSjoerg ///    written to bits [15:0]. The other bits are assigned zeros.
439806f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a)439906f32e7eSjoerg _mm_movemask_epi8(__m128i __a)
440006f32e7eSjoerg {
440106f32e7eSjoerg   return __builtin_ia32_pmovmskb128((__v16qi)__a);
440206f32e7eSjoerg }
440306f32e7eSjoerg 
/// Constructs a 128-bit integer vector by shuffling four 32-bit
///    elements of a 128-bit integer vector parameter, using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so the leading cast cannot be split
   from the call by a neighbouring operator (e.g. a trailing subscript). */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
443406f32e7eSjoerg 
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so the leading cast cannot be split
   from the call by a neighbouring operator (e.g. a trailing subscript). */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
446406f32e7eSjoerg 
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

449506f32e7eSjoerg /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
449606f32e7eSjoerg ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
449706f32e7eSjoerg ///
449806f32e7eSjoerg /// \headerfile <x86intrin.h>
449906f32e7eSjoerg ///
450006f32e7eSjoerg /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
450106f32e7eSjoerg ///   instruction.
450206f32e7eSjoerg ///
450306f32e7eSjoerg /// \param __a
450406f32e7eSjoerg ///    A 128-bit vector of [16 x i8].
450506f32e7eSjoerg ///    Bits [71:64] are written to bits [7:0] of the result. \n
450606f32e7eSjoerg ///    Bits [79:72] are written to bits [23:16] of the result. \n
450706f32e7eSjoerg ///    Bits [87:80] are written to bits [39:32] of the result. \n
450806f32e7eSjoerg ///    Bits [95:88] are written to bits [55:48] of the result. \n
450906f32e7eSjoerg ///    Bits [103:96] are written to bits [71:64] of the result. \n
451006f32e7eSjoerg ///    Bits [111:104] are written to bits [87:80] of the result. \n
451106f32e7eSjoerg ///    Bits [119:112] are written to bits [103:96] of the result. \n
451206f32e7eSjoerg ///    Bits [127:120] are written to bits [119:112] of the result.
451306f32e7eSjoerg /// \param __b
451406f32e7eSjoerg ///    A 128-bit vector of [16 x i8]. \n
451506f32e7eSjoerg ///    Bits [71:64] are written to bits [15:8] of the result. \n
451606f32e7eSjoerg ///    Bits [79:72] are written to bits [31:24] of the result. \n
451706f32e7eSjoerg ///    Bits [87:80] are written to bits [47:40] of the result. \n
451806f32e7eSjoerg ///    Bits [95:88] are written to bits [63:56] of the result. \n
451906f32e7eSjoerg ///    Bits [103:96] are written to bits [79:72] of the result. \n
452006f32e7eSjoerg ///    Bits [111:104] are written to bits [95:88] of the result. \n
452106f32e7eSjoerg ///    Bits [119:112] are written to bits [111:104] of the result. \n
452206f32e7eSjoerg ///    Bits [127:120] are written to bits [127:120] of the result.
452306f32e7eSjoerg /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
452406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a,__m128i __b)452506f32e7eSjoerg _mm_unpackhi_epi8(__m128i __a, __m128i __b)
452606f32e7eSjoerg {
452706f32e7eSjoerg   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
452806f32e7eSjoerg }
452906f32e7eSjoerg 
453006f32e7eSjoerg /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
453106f32e7eSjoerg ///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
453206f32e7eSjoerg ///
453306f32e7eSjoerg /// \headerfile <x86intrin.h>
453406f32e7eSjoerg ///
453506f32e7eSjoerg /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
453606f32e7eSjoerg ///   instruction.
453706f32e7eSjoerg ///
453806f32e7eSjoerg /// \param __a
453906f32e7eSjoerg ///    A 128-bit vector of [8 x i16].
454006f32e7eSjoerg ///    Bits [79:64] are written to bits [15:0] of the result. \n
454106f32e7eSjoerg ///    Bits [95:80] are written to bits [47:32] of the result. \n
454206f32e7eSjoerg ///    Bits [111:96] are written to bits [79:64] of the result. \n
454306f32e7eSjoerg ///    Bits [127:112] are written to bits [111:96] of the result.
454406f32e7eSjoerg /// \param __b
454506f32e7eSjoerg ///    A 128-bit vector of [8 x i16].
454606f32e7eSjoerg ///    Bits [79:64] are written to bits [31:16] of the result. \n
454706f32e7eSjoerg ///    Bits [95:80] are written to bits [63:48] of the result. \n
454806f32e7eSjoerg ///    Bits [111:96] are written to bits [95:80] of the result. \n
454906f32e7eSjoerg ///    Bits [127:112] are written to bits [127:112] of the result.
455006f32e7eSjoerg /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
455106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a,__m128i __b)455206f32e7eSjoerg _mm_unpackhi_epi16(__m128i __a, __m128i __b)
455306f32e7eSjoerg {
455406f32e7eSjoerg   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
455506f32e7eSjoerg }
455606f32e7eSjoerg 
455706f32e7eSjoerg /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
455806f32e7eSjoerg ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
455906f32e7eSjoerg ///
456006f32e7eSjoerg /// \headerfile <x86intrin.h>
456106f32e7eSjoerg ///
456206f32e7eSjoerg /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
456306f32e7eSjoerg ///   instruction.
456406f32e7eSjoerg ///
456506f32e7eSjoerg /// \param __a
456606f32e7eSjoerg ///    A 128-bit vector of [4 x i32]. \n
456706f32e7eSjoerg ///    Bits [95:64] are written to bits [31:0] of the destination. \n
456806f32e7eSjoerg ///    Bits [127:96] are written to bits [95:64] of the destination.
456906f32e7eSjoerg /// \param __b
457006f32e7eSjoerg ///    A 128-bit vector of [4 x i32]. \n
457106f32e7eSjoerg ///    Bits [95:64] are written to bits [64:32] of the destination. \n
457206f32e7eSjoerg ///    Bits [127:96] are written to bits [127:96] of the destination.
457306f32e7eSjoerg /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
457406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a,__m128i __b)457506f32e7eSjoerg _mm_unpackhi_epi32(__m128i __a, __m128i __b)
457606f32e7eSjoerg {
457706f32e7eSjoerg   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
457806f32e7eSjoerg }
457906f32e7eSjoerg 
458006f32e7eSjoerg /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
458106f32e7eSjoerg ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
458206f32e7eSjoerg ///
458306f32e7eSjoerg /// \headerfile <x86intrin.h>
458406f32e7eSjoerg ///
458506f32e7eSjoerg /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
458606f32e7eSjoerg ///   instruction.
458706f32e7eSjoerg ///
458806f32e7eSjoerg /// \param __a
458906f32e7eSjoerg ///    A 128-bit vector of [2 x i64]. \n
459006f32e7eSjoerg ///    Bits [127:64] are written to bits [63:0] of the destination.
459106f32e7eSjoerg /// \param __b
459206f32e7eSjoerg ///    A 128-bit vector of [2 x i64]. \n
459306f32e7eSjoerg ///    Bits [127:64] are written to bits [127:64] of the destination.
459406f32e7eSjoerg /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
459506f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a,__m128i __b)459606f32e7eSjoerg _mm_unpackhi_epi64(__m128i __a, __m128i __b)
459706f32e7eSjoerg {
459806f32e7eSjoerg   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
459906f32e7eSjoerg }
460006f32e7eSjoerg 
460106f32e7eSjoerg /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
460206f32e7eSjoerg ///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
460306f32e7eSjoerg ///
460406f32e7eSjoerg /// \headerfile <x86intrin.h>
460506f32e7eSjoerg ///
460606f32e7eSjoerg /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
460706f32e7eSjoerg ///   instruction.
460806f32e7eSjoerg ///
460906f32e7eSjoerg /// \param __a
461006f32e7eSjoerg ///    A 128-bit vector of [16 x i8]. \n
461106f32e7eSjoerg ///    Bits [7:0] are written to bits [7:0] of the result. \n
461206f32e7eSjoerg ///    Bits [15:8] are written to bits [23:16] of the result. \n
461306f32e7eSjoerg ///    Bits [23:16] are written to bits [39:32] of the result. \n
461406f32e7eSjoerg ///    Bits [31:24] are written to bits [55:48] of the result. \n
461506f32e7eSjoerg ///    Bits [39:32] are written to bits [71:64] of the result. \n
461606f32e7eSjoerg ///    Bits [47:40] are written to bits [87:80] of the result. \n
461706f32e7eSjoerg ///    Bits [55:48] are written to bits [103:96] of the result. \n
461806f32e7eSjoerg ///    Bits [63:56] are written to bits [119:112] of the result.
461906f32e7eSjoerg /// \param __b
462006f32e7eSjoerg ///    A 128-bit vector of [16 x i8].
462106f32e7eSjoerg ///    Bits [7:0] are written to bits [15:8] of the result. \n
462206f32e7eSjoerg ///    Bits [15:8] are written to bits [31:24] of the result. \n
462306f32e7eSjoerg ///    Bits [23:16] are written to bits [47:40] of the result. \n
462406f32e7eSjoerg ///    Bits [31:24] are written to bits [63:56] of the result. \n
462506f32e7eSjoerg ///    Bits [39:32] are written to bits [79:72] of the result. \n
462606f32e7eSjoerg ///    Bits [47:40] are written to bits [95:88] of the result. \n
462706f32e7eSjoerg ///    Bits [55:48] are written to bits [111:104] of the result. \n
462806f32e7eSjoerg ///    Bits [63:56] are written to bits [127:120] of the result.
462906f32e7eSjoerg /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
463006f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a,__m128i __b)463106f32e7eSjoerg _mm_unpacklo_epi8(__m128i __a, __m128i __b)
463206f32e7eSjoerg {
463306f32e7eSjoerg   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
463406f32e7eSjoerg }
463506f32e7eSjoerg 
463606f32e7eSjoerg /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
463706f32e7eSjoerg ///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
463806f32e7eSjoerg ///    [8 x i16].
463906f32e7eSjoerg ///
464006f32e7eSjoerg /// \headerfile <x86intrin.h>
464106f32e7eSjoerg ///
464206f32e7eSjoerg /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
464306f32e7eSjoerg ///   instruction.
464406f32e7eSjoerg ///
464506f32e7eSjoerg /// \param __a
464606f32e7eSjoerg ///    A 128-bit vector of [8 x i16].
464706f32e7eSjoerg ///    Bits [15:0] are written to bits [15:0] of the result. \n
464806f32e7eSjoerg ///    Bits [31:16] are written to bits [47:32] of the result. \n
464906f32e7eSjoerg ///    Bits [47:32] are written to bits [79:64] of the result. \n
465006f32e7eSjoerg ///    Bits [63:48] are written to bits [111:96] of the result.
465106f32e7eSjoerg /// \param __b
465206f32e7eSjoerg ///    A 128-bit vector of [8 x i16].
465306f32e7eSjoerg ///    Bits [15:0] are written to bits [31:16] of the result. \n
465406f32e7eSjoerg ///    Bits [31:16] are written to bits [63:48] of the result. \n
465506f32e7eSjoerg ///    Bits [47:32] are written to bits [95:80] of the result. \n
465606f32e7eSjoerg ///    Bits [63:48] are written to bits [127:112] of the result.
465706f32e7eSjoerg /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
465806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a,__m128i __b)465906f32e7eSjoerg _mm_unpacklo_epi16(__m128i __a, __m128i __b)
466006f32e7eSjoerg {
466106f32e7eSjoerg   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
466206f32e7eSjoerg }
466306f32e7eSjoerg 
466406f32e7eSjoerg /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
466506f32e7eSjoerg ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
466606f32e7eSjoerg ///
466706f32e7eSjoerg /// \headerfile <x86intrin.h>
466806f32e7eSjoerg ///
466906f32e7eSjoerg /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
467006f32e7eSjoerg ///   instruction.
467106f32e7eSjoerg ///
467206f32e7eSjoerg /// \param __a
467306f32e7eSjoerg ///    A 128-bit vector of [4 x i32]. \n
467406f32e7eSjoerg ///    Bits [31:0] are written to bits [31:0] of the destination. \n
467506f32e7eSjoerg ///    Bits [63:32] are written to bits [95:64] of the destination.
467606f32e7eSjoerg /// \param __b
467706f32e7eSjoerg ///    A 128-bit vector of [4 x i32]. \n
467806f32e7eSjoerg ///    Bits [31:0] are written to bits [64:32] of the destination. \n
467906f32e7eSjoerg ///    Bits [63:32] are written to bits [127:96] of the destination.
468006f32e7eSjoerg /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
468106f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a,__m128i __b)468206f32e7eSjoerg _mm_unpacklo_epi32(__m128i __a, __m128i __b)
468306f32e7eSjoerg {
468406f32e7eSjoerg   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
468506f32e7eSjoerg }
468606f32e7eSjoerg 
468706f32e7eSjoerg /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
468806f32e7eSjoerg ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
468906f32e7eSjoerg ///
469006f32e7eSjoerg /// \headerfile <x86intrin.h>
469106f32e7eSjoerg ///
469206f32e7eSjoerg /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
469306f32e7eSjoerg ///   instruction.
469406f32e7eSjoerg ///
469506f32e7eSjoerg /// \param __a
469606f32e7eSjoerg ///    A 128-bit vector of [2 x i64]. \n
469706f32e7eSjoerg ///    Bits [63:0] are written to bits [63:0] of the destination. \n
469806f32e7eSjoerg /// \param __b
469906f32e7eSjoerg ///    A 128-bit vector of [2 x i64]. \n
470006f32e7eSjoerg ///    Bits [63:0] are written to bits [127:64] of the destination. \n
470106f32e7eSjoerg /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
470206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a,__m128i __b)470306f32e7eSjoerg _mm_unpacklo_epi64(__m128i __a, __m128i __b)
470406f32e7eSjoerg {
470506f32e7eSjoerg   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
470606f32e7eSjoerg }
470706f32e7eSjoerg 
470806f32e7eSjoerg /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
470906f32e7eSjoerg ///    integer.
471006f32e7eSjoerg ///
471106f32e7eSjoerg /// \headerfile <x86intrin.h>
471206f32e7eSjoerg ///
471306f32e7eSjoerg /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
471406f32e7eSjoerg ///
471506f32e7eSjoerg /// \param __a
471606f32e7eSjoerg ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
471706f32e7eSjoerg ///    destination.
471806f32e7eSjoerg /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
471906f32e7eSjoerg static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a)472006f32e7eSjoerg _mm_movepi64_pi64(__m128i __a)
472106f32e7eSjoerg {
472206f32e7eSjoerg   return (__m64)__a[0];
472306f32e7eSjoerg }
472406f32e7eSjoerg 
472506f32e7eSjoerg /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
472606f32e7eSjoerg ///    upper bits.
472706f32e7eSjoerg ///
472806f32e7eSjoerg /// \headerfile <x86intrin.h>
472906f32e7eSjoerg ///
473006f32e7eSjoerg /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
473106f32e7eSjoerg ///
473206f32e7eSjoerg /// \param __a
473306f32e7eSjoerg ///    A 64-bit value.
473406f32e7eSjoerg /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
473506f32e7eSjoerg ///    the operand. The upper 64 bits are assigned zeros.
473606f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a)473706f32e7eSjoerg _mm_movpi64_epi64(__m64 __a)
473806f32e7eSjoerg {
473906f32e7eSjoerg   return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
474006f32e7eSjoerg }
474106f32e7eSjoerg 
474206f32e7eSjoerg /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
474306f32e7eSjoerg ///    integer vector, zeroing the upper bits.
474406f32e7eSjoerg ///
474506f32e7eSjoerg /// \headerfile <x86intrin.h>
474606f32e7eSjoerg ///
474706f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
474806f32e7eSjoerg ///
474906f32e7eSjoerg /// \param __a
475006f32e7eSjoerg ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
475106f32e7eSjoerg ///    destination.
475206f32e7eSjoerg /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
475306f32e7eSjoerg ///    the operand. The upper 64 bits are assigned zeros.
475406f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)475506f32e7eSjoerg _mm_move_epi64(__m128i __a)
475606f32e7eSjoerg {
475706f32e7eSjoerg   return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
475806f32e7eSjoerg }
475906f32e7eSjoerg 
476006f32e7eSjoerg /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
476106f32e7eSjoerg ///    [2 x double] and interleaves them into a 128-bit vector of [2 x
476206f32e7eSjoerg ///    double].
476306f32e7eSjoerg ///
476406f32e7eSjoerg /// \headerfile <x86intrin.h>
476506f32e7eSjoerg ///
476606f32e7eSjoerg /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
476706f32e7eSjoerg ///
476806f32e7eSjoerg /// \param __a
476906f32e7eSjoerg ///    A 128-bit vector of [2 x double]. \n
477006f32e7eSjoerg ///    Bits [127:64] are written to bits [63:0] of the destination.
477106f32e7eSjoerg /// \param __b
477206f32e7eSjoerg ///    A 128-bit vector of [2 x double]. \n
477306f32e7eSjoerg ///    Bits [127:64] are written to bits [127:64] of the destination.
477406f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
477506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a,__m128d __b)477606f32e7eSjoerg _mm_unpackhi_pd(__m128d __a, __m128d __b)
477706f32e7eSjoerg {
477806f32e7eSjoerg   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
477906f32e7eSjoerg }
478006f32e7eSjoerg 
478106f32e7eSjoerg /// Unpacks the low-order 64-bit elements from two 128-bit vectors
478206f32e7eSjoerg ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
478306f32e7eSjoerg ///    double].
478406f32e7eSjoerg ///
478506f32e7eSjoerg /// \headerfile <x86intrin.h>
478606f32e7eSjoerg ///
478706f32e7eSjoerg /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
478806f32e7eSjoerg ///
478906f32e7eSjoerg /// \param __a
479006f32e7eSjoerg ///    A 128-bit vector of [2 x double]. \n
479106f32e7eSjoerg ///    Bits [63:0] are written to bits [63:0] of the destination.
479206f32e7eSjoerg /// \param __b
479306f32e7eSjoerg ///    A 128-bit vector of [2 x double]. \n
479406f32e7eSjoerg ///    Bits [63:0] are written to bits [127:64] of the destination.
479506f32e7eSjoerg /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
479606f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a,__m128d __b)479706f32e7eSjoerg _mm_unpacklo_pd(__m128d __a, __m128d __b)
479806f32e7eSjoerg {
479906f32e7eSjoerg   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
480006f32e7eSjoerg }
480106f32e7eSjoerg 
480206f32e7eSjoerg /// Extracts the sign bits of the double-precision values in the 128-bit
480306f32e7eSjoerg ///    vector of [2 x double], zero-extends the value, and writes it to the
480406f32e7eSjoerg ///    low-order bits of the destination.
480506f32e7eSjoerg ///
480606f32e7eSjoerg /// \headerfile <x86intrin.h>
480706f32e7eSjoerg ///
480806f32e7eSjoerg /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
480906f32e7eSjoerg ///
481006f32e7eSjoerg /// \param __a
481106f32e7eSjoerg ///    A 128-bit vector of [2 x double] containing the values with sign bits to
481206f32e7eSjoerg ///    be extracted.
481306f32e7eSjoerg /// \returns The sign bits from each of the double-precision elements in \a __a,
481406f32e7eSjoerg ///    written to bits [1:0]. The remaining bits are assigned values of zero.
481506f32e7eSjoerg static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a)481606f32e7eSjoerg _mm_movemask_pd(__m128d __a)
481706f32e7eSjoerg {
481806f32e7eSjoerg   return __builtin_ia32_movmskpd((__v2df)__a);
481906f32e7eSjoerg }
482006f32e7eSjoerg 
482106f32e7eSjoerg 
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), \
                                  (__v2df)(__m128d)(b), (int)(i)))

485006f32e7eSjoerg /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
485106f32e7eSjoerg ///    floating-point vector of [4 x float].
485206f32e7eSjoerg ///
485306f32e7eSjoerg /// \headerfile <x86intrin.h>
485406f32e7eSjoerg ///
485506f32e7eSjoerg /// This intrinsic has no corresponding instruction.
485606f32e7eSjoerg ///
485706f32e7eSjoerg /// \param __a
485806f32e7eSjoerg ///    A 128-bit floating-point vector of [2 x double].
485906f32e7eSjoerg /// \returns A 128-bit floating-point vector of [4 x float] containing the same
486006f32e7eSjoerg ///    bitwise pattern as the parameter.
486106f32e7eSjoerg static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a)486206f32e7eSjoerg _mm_castpd_ps(__m128d __a)
486306f32e7eSjoerg {
486406f32e7eSjoerg   return (__m128)__a;
486506f32e7eSjoerg }
486606f32e7eSjoerg 
486706f32e7eSjoerg /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
486806f32e7eSjoerg ///    integer vector.
486906f32e7eSjoerg ///
487006f32e7eSjoerg /// \headerfile <x86intrin.h>
487106f32e7eSjoerg ///
487206f32e7eSjoerg /// This intrinsic has no corresponding instruction.
487306f32e7eSjoerg ///
487406f32e7eSjoerg /// \param __a
487506f32e7eSjoerg ///    A 128-bit floating-point vector of [2 x double].
487606f32e7eSjoerg /// \returns A 128-bit integer vector containing the same bitwise pattern as the
487706f32e7eSjoerg ///    parameter.
487806f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a)487906f32e7eSjoerg _mm_castpd_si128(__m128d __a)
488006f32e7eSjoerg {
488106f32e7eSjoerg   return (__m128i)__a;
488206f32e7eSjoerg }
488306f32e7eSjoerg 
488406f32e7eSjoerg /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
488506f32e7eSjoerg ///    floating-point vector of [2 x double].
488606f32e7eSjoerg ///
488706f32e7eSjoerg /// \headerfile <x86intrin.h>
488806f32e7eSjoerg ///
488906f32e7eSjoerg /// This intrinsic has no corresponding instruction.
489006f32e7eSjoerg ///
489106f32e7eSjoerg /// \param __a
489206f32e7eSjoerg ///    A 128-bit floating-point vector of [4 x float].
489306f32e7eSjoerg /// \returns A 128-bit floating-point vector of [2 x double] containing the same
489406f32e7eSjoerg ///    bitwise pattern as the parameter.
489506f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a)489606f32e7eSjoerg _mm_castps_pd(__m128 __a)
489706f32e7eSjoerg {
489806f32e7eSjoerg   return (__m128d)__a;
489906f32e7eSjoerg }
490006f32e7eSjoerg 
490106f32e7eSjoerg /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
490206f32e7eSjoerg ///    integer vector.
490306f32e7eSjoerg ///
490406f32e7eSjoerg /// \headerfile <x86intrin.h>
490506f32e7eSjoerg ///
490606f32e7eSjoerg /// This intrinsic has no corresponding instruction.
490706f32e7eSjoerg ///
490806f32e7eSjoerg /// \param __a
490906f32e7eSjoerg ///    A 128-bit floating-point vector of [4 x float].
491006f32e7eSjoerg /// \returns A 128-bit integer vector containing the same bitwise pattern as the
491106f32e7eSjoerg ///    parameter.
491206f32e7eSjoerg static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a)491306f32e7eSjoerg _mm_castps_si128(__m128 __a)
491406f32e7eSjoerg {
491506f32e7eSjoerg   return (__m128i)__a;
491606f32e7eSjoerg }
491706f32e7eSjoerg 
491806f32e7eSjoerg /// Casts a 128-bit integer vector into a 128-bit floating-point vector
491906f32e7eSjoerg ///    of [4 x float].
492006f32e7eSjoerg ///
492106f32e7eSjoerg /// \headerfile <x86intrin.h>
492206f32e7eSjoerg ///
492306f32e7eSjoerg /// This intrinsic has no corresponding instruction.
492406f32e7eSjoerg ///
492506f32e7eSjoerg /// \param __a
492606f32e7eSjoerg ///    A 128-bit integer vector.
492706f32e7eSjoerg /// \returns A 128-bit floating-point vector of [4 x float] containing the same
492806f32e7eSjoerg ///    bitwise pattern as the parameter.
492906f32e7eSjoerg static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a)493006f32e7eSjoerg _mm_castsi128_ps(__m128i __a)
493106f32e7eSjoerg {
493206f32e7eSjoerg   return (__m128)__a;
493306f32e7eSjoerg }
493406f32e7eSjoerg 
493506f32e7eSjoerg /// Casts a 128-bit integer vector into a 128-bit floating-point vector
493606f32e7eSjoerg ///    of [2 x double].
493706f32e7eSjoerg ///
493806f32e7eSjoerg /// \headerfile <x86intrin.h>
493906f32e7eSjoerg ///
494006f32e7eSjoerg /// This intrinsic has no corresponding instruction.
494106f32e7eSjoerg ///
494206f32e7eSjoerg /// \param __a
494306f32e7eSjoerg ///    A 128-bit integer vector.
494406f32e7eSjoerg /// \returns A 128-bit floating-point vector of [2 x double] containing the same
494506f32e7eSjoerg ///    bitwise pattern as the parameter.
494606f32e7eSjoerg static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a)494706f32e7eSjoerg _mm_castsi128_pd(__m128i __a)
494806f32e7eSjoerg {
494906f32e7eSjoerg   return (__m128d)__a;
495006f32e7eSjoerg }
495106f32e7eSjoerg 
/* _mm_pause is only declared here, with C linkage when compiled as C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
/* The function-attribute helper macros are private to this header. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

/// Builds a 2-bit selector from two single-bit fields: \a x is placed in
///    bit 1 and \a y in bit 0 (for example, as the immediate operand of
///    _mm_shuffle_pd).
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Values of the denormals-are-zero field within the control/status word
   read by _mm_getcsr() and written by _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_ON   (0x0040U)
#define _MM_DENORMALS_ZERO_OFF  (0x0000U)

/* Mask that isolates the denormals-are-zero field. */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Read the current denormals-are-zero setting; the result compares equal to
   _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Replace the denormals-are-zero field with x, leaving all other bits of the
   control/status word unchanged. x is OR-ed in unmasked, so it should be
   one of the _MM_DENORMALS_ZERO_* values above. */
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))

498106f32e7eSjoerg #endif /* __EMMINTRIN_H */
4982