/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512FINTRIN_H
#define __AVX512FINTRIN_H

typedef char __v64qi __attribute__((__vector_size__(64)));
typedef short __v32hi __attribute__((__vector_size__(64)));
typedef double __v8df __attribute__((__vector_size__(64)));
typedef float __v16sf __attribute__((__vector_size__(64)));
typedef long long __v8di __attribute__((__vector_size__(64)));
typedef int __v16si __attribute__((__vector_size__(64)));

/* Unsigned types */
typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
typedef unsigned int __v16su __attribute__((__vector_size__(64)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v64qs __attribute__((__vector_size__(64)));

typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));

typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));

typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
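
/* Note: each bit of a mask controls one vector element, with bit 0 mapping
 * to element 0. An __mmask16 covers the 16 lanes of a 32-bit-element
 * vector; an __mmask8 covers the 8 lanes of a 64-bit-element vector. */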

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
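
/* Illustrative sketch (not part of the original header): the *_round_*
 * intrinsics below take one of these macros as their rounding argument.
 * With Clang, an explicit rounding mode is expected to be combined with
 * _MM_FROUND_NO_EXC from <smmintrin.h>, e.g.:
 *
 *   __m512d r = _mm512_sqrt_round_pd(x, _MM_FROUND_TO_NEAREST_INT |
 *                                       _MM_FROUND_NO_EXC);
 */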

/* Constants for integer comparison predicates */
typedef enum {
    _MM_CMPINT_EQ,      /* Equal */
    _MM_CMPINT_LT,      /* Less than */
    _MM_CMPINT_LE,      /* Less than or Equal */
    _MM_CMPINT_UNUSED,
    _MM_CMPINT_NE,      /* Not Equal */
    _MM_CMPINT_NLT,     /* Not Less than */
#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
    _MM_CMPINT_NLE      /* Not Less than or Equal */
#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
} _MM_CMPINT_ENUM;
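
/* Illustrative sketch (not part of the original header): these predicates
 * select the comparison performed by the integer cmp intrinsics declared
 * later in this file, which return one mask bit per element, e.g.:
 *
 *   __mmask16 m = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
 */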
63
64 typedef enum
65 {
66 _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
67 _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
68 _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
69 _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
70 _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
71 _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
72 _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
73 _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
74 _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
75 _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
76 _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
77 _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
78 _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
79 _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
80 _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
81 _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
82 _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
83 _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
84 _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
85 _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
86 _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
87 _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
88 _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
89 _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
90 _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
91 _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
92 _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
93 _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
94 _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
95 _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
96 _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
97 _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
98 _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
99 _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
100 _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
101 _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
102 _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
103 _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
104 _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
105 _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
106 _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
107 _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
108 _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
109 _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
110 _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
111 _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
112 _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
113 _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
114 _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
115 _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
116 _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
117 _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
118 _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
119 _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
120 _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
121 _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
122 _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
123 _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
124 _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
125 _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
126 _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
127 _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
128 _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
129 _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
130 _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
131 _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
132 _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
133 _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
134 _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
135 _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
136 _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
137 _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
138 _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
139 _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
140 _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
141 _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
142 _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
143 _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
144 _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
145 _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
146 _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
147 _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
148 _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
149 _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
150 _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
151 _MM_PERM_DDDD = 0xFF
152 } _MM_PERM_ENUM;
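
/* Illustrative sketch (not part of the original header): each letter of an
 * _MM_PERM_* name selects a source element (A..D name elements 0..3),
 * written from the highest destination position down to the lowest, so
 * _MM_PERM_DCBA (0xE4) is the identity selection, e.g.:
 *
 *   __m512i r = _mm512_shuffle_epi32(v, _MM_PERM_DCBA);
 */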

typedef enum
{
  _MM_MANT_NORM_1_2,    /* interval [1, 2)      */
  _MM_MANT_NORM_p5_2,   /* interval [0.5, 2)    */
  _MM_MANT_NORM_p5_1,   /* interval [0.5, 1)    */
  _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */
} _MM_MANTISSA_NORM_ENUM;

typedef enum
{
  _MM_MANT_SIGN_src,  /* sign = sign(SRC)           */
  _MM_MANT_SIGN_zero, /* sign = 0                   */
  _MM_MANT_SIGN_nan   /* DEST = NaN if sign(SRC) = 1 */
} _MM_MANTISSA_SIGN_ENUM;
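
/* Illustrative sketch (not part of the original header): these two enums
 * parameterize the getmant intrinsics, choosing the normalization interval
 * of the extracted mantissa and how the sign is handled, e.g.:
 *
 *   __m512d m = _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 */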

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))

/* Create vectors with repeated elements */

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_setzero_si512(void)
{
  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
}

#define _mm512_setzero_epi32 _mm512_setzero_si512

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_undefined_pd(void)
{
  return (__m512d)__builtin_ia32_undef512();
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_undefined(void)
{
  return (__m512)__builtin_ia32_undef512();
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_undefined_ps(void)
{
  return (__m512)__builtin_ia32_undef512();
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_undefined_epi32(void)
{
  return (__m512i)__builtin_ia32_undef512();
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcastd_epi32 (__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si) _mm512_broadcastd_epi32(__A),
                                             (__v16si) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si) _mm512_broadcastd_epi32(__A),
                                             (__v16si) _mm512_setzero_si512());
}
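
/* Illustrative note (not part of the original header): throughout this file
 * the mask_ forms merge, keeping elements of the passthrough operand (__O
 * above) where the corresponding mask bit is 0, while the maskz_ forms zero
 * those elements instead, e.g.:
 *
 *   __m512i merged = _mm512_mask_broadcastd_epi32(old, 0x00FF, x);
 *   __m512i zeroed = _mm512_maskz_broadcastd_epi32(0x00FF, x);
 */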

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcastq_epi64 (__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di) _mm512_broadcastq_epi64(__A),
                                             (__v8di) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di) _mm512_broadcastq_epi64(__A),
                                             (__v8di) _mm512_setzero_si512());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_setzero_ps(void)
{
  return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                                 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}

#define _mm512_setzero _mm512_setzero_ps

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_setzero_pd(void)
{
  return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_set1_ps(float __w)
{
  return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                                 __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_set1_pd(double __w)
{
  return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi8(char __w)
{
  return __extension__ (__m512i)(__v64qi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi16(short __w)
{
  return __extension__ (__m512i)(__v32hi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi32(int __s)
{
  return __extension__ (__m512i)(__v16si){
    __s, __s, __s, __s, __s, __s, __s, __s,
    __s, __s, __s, __s, __s, __s, __s, __s };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si)_mm512_set1_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi64(long long __d)
{
  return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di)_mm512_set1_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_broadcastss_ps(__m128 __A)
{
  return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
                                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
{
  return __extension__ (__m512i)(__v16si)
    { __D, __C, __B, __A, __D, __C, __B, __A,
      __D, __C, __B, __A, __D, __C, __B, __A };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set4_epi64 (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m512i) (__v8di)
    { __D, __C, __B, __A, __D, __C, __B, __A };
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_set4_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m512d)
    { __D, __C, __B, __A, __D, __C, __B, __A };
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_set4_ps (float __A, float __B, float __C, float __D)
{
  return __extension__ (__m512)
    { __D, __C, __B, __A, __D, __C, __B, __A,
      __D, __C, __B, __A, __D, __C, __B, __A };
}

#define _mm512_setr4_epi32(e0,e1,e2,e3) \
  _mm512_set4_epi32((e3),(e2),(e1),(e0))

#define _mm512_setr4_epi64(e0,e1,e2,e3) \
  _mm512_set4_epi64((e3),(e2),(e1),(e0))

#define _mm512_setr4_pd(e0,e1,e2,e3) \
  _mm512_set4_pd((e3),(e2),(e1),(e0))

#define _mm512_setr4_ps(e0,e1,e2,e3) \
  _mm512_set4_ps((e3),(e2),(e1),(e0))
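
/* Illustrative note (not part of the original header): as the definitions
 * above show, _mm512_set4_* take arguments from the highest element down,
 * placing __D in element 0, while the _mm512_setr4_* macros take the same
 * four values in low-to-high (memory) order, e.g.:
 *
 *   __m512i v = _mm512_setr4_epi32(0, 1, 2, 3);  // element 0 holds 0
 */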

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_broadcastsd_pd(__m128d __A)
{
  return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
}

/* Cast between vector types */

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castpd256_pd512(__m256d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castps256_ps512(__m256 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline __m128d __DEFAULT_FN_ATTRS512
_mm512_castpd512_pd128(__m512d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

static __inline __m256d __DEFAULT_FN_ATTRS512
_mm512_castpd512_pd256 (__m512d __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
}

static __inline __m128 __DEFAULT_FN_ATTRS512
_mm512_castps512_ps128(__m512 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}

static __inline __m256 __DEFAULT_FN_ATTRS512
_mm512_castps512_ps256 (__m512 __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castpd_ps (__m512d __A)
{
  return (__m512) (__A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_castpd_si512 (__m512d __A)
{
  return (__m512i) (__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_castpd128_pd512 (__m128d __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castps_pd (__m512 __A)
{
  return (__m512d) (__A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_castps_si512 (__m512 __A)
{
  return (__m512i) (__A);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_castps128_ps512 (__m128 __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castsi128_si512 (__m128i __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castsi256_si512 (__m256i __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castsi512_ps (__m512i __A)
{
  return (__m512) (__A);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castsi512_pd (__m512i __A)
{
  return (__m512d) (__A);
}

static __inline __m128i __DEFAULT_FN_ATTRS512
_mm512_castsi512_si128 (__m512i __A)
{
  return (__m128i)__builtin_shufflevector(__A, __A, 0, 1);
}

static __inline __m256i __DEFAULT_FN_ATTRS512
_mm512_castsi512_si256 (__m512i __A)
{
  return (__m256i)__builtin_shufflevector(__A, __A, 0, 1, 2, 3);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_int2mask(int __a)
{
  return (__mmask16)__a;
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm512_mask2int(__mmask16 __a)
{
  return (int)__a;
}

/// Constructs a 512-bit floating-point vector of [8 x double] from a
///    128-bit floating-point vector of [2 x double]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_zextpd128_pd512(__m128d __a)
{
  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
}

/// Constructs a 512-bit floating-point vector of [8 x double] from a
///    256-bit floating-point vector of [4 x double]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_zextpd256_pd512(__m256d __a)
{
  return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
}

/// Constructs a 512-bit floating-point vector of [16 x float] from a
///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
///    the value of the source vector. The upper 384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_zextps128_ps512(__m128 __a)
{
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
}

/// Constructs a 512-bit floating-point vector of [16 x float] from a
///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
///    the value of the source vector. The upper 256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_zextps256_ps512(__m256 __a)
{
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Constructs a 512-bit integer vector from a 128-bit integer vector.
///    The lower 128 bits contain the value of the source vector. The upper
///    384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
///    the parameter. The upper 384 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_zextsi128_si512(__m128i __a)
{
  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
}

/// Constructs a 512-bit integer vector from a 256-bit integer vector.
///    The lower 256 bits contain the value of the source vector. The upper
///    256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
///    the parameter. The upper 256 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_zextsi256_si512(__m256i __a)
{
  return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
}
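
/* Illustrative note (not part of the original header): unlike the zext
 * intrinsics above, the 128->512 and 256->512 cast intrinsics leave the
 * upper bits of their result undefined, so prefer zext when those bits
 * must be zero, e.g.:
 *
 *   __m512i hi_undef = _mm512_castsi128_si512(x);
 *   __m512i hi_zero  = _mm512_zextsi128_si512(x);
 */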

/* Bitwise operators */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a & (__v16su)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                                             (__v16si) _mm512_and_epi32(__a, __b),
                                             (__v16si) __src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512(),
                                         __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a & (__v8du)__b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i) __builtin_ia32_selectq_512((__mmask8) __k,
                                              (__v8di) _mm512_and_epi64(__a, __b),
                                              (__v8di) __src);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512(),
                                         __k, __a, __b);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_si512 (__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v16su)__A & (__v16su)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_andnot_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
                                           __U, __A, __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_andnot_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
                                           __U, __A, __B);
}
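
/* Illustrative note (not part of the original header): as the definitions
 * above show, andnot complements its first operand, computing ~A & B, e.g.:
 *
 *   __m512i cleared = _mm512_andnot_epi32(mask, v);  // v & ~mask
 */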
718
719 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_epi32(__m512i __a,__m512i __b)720 _mm512_or_epi32(__m512i __a, __m512i __b)
721 {
722 return (__m512i)((__v16su)__a | (__v16su)__b);
723 }
724
725 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_or_epi32(__m512i __src,__mmask16 __k,__m512i __a,__m512i __b)726 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
727 {
728 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
729 (__v16si)_mm512_or_epi32(__a, __b),
730 (__v16si)__src);
731 }
732
733 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_or_epi32(__mmask16 __k,__m512i __a,__m512i __b)734 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
735 {
736 return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
737 }
738
739 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_epi64(__m512i __a,__m512i __b)740 _mm512_or_epi64(__m512i __a, __m512i __b)
741 {
742 return (__m512i)((__v8du)__a | (__v8du)__b);
743 }
744
745 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_or_epi64(__m512i __src,__mmask8 __k,__m512i __a,__m512i __b)746 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
747 {
748 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
749 (__v8di)_mm512_or_epi64(__a, __b),
750 (__v8di)__src);
751 }
752
753 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_or_epi64(__mmask8 __k,__m512i __a,__m512i __b)754 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
755 {
756 return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
757 }
758
759 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_epi32(__m512i __a,__m512i __b)760 _mm512_xor_epi32(__m512i __a, __m512i __b)
761 {
762 return (__m512i)((__v16su)__a ^ (__v16su)__b);
763 }
764
765 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_xor_epi32(__m512i __src,__mmask16 __k,__m512i __a,__m512i __b)766 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
767 {
768 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
769 (__v16si)_mm512_xor_epi32(__a, __b),
770 (__v16si)__src);
771 }
772
773 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_xor_epi32(__mmask16 __k,__m512i __a,__m512i __b)774 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
775 {
776 return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
777 }
778
779 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_epi64(__m512i __a,__m512i __b)780 _mm512_xor_epi64(__m512i __a, __m512i __b)
781 {
782 return (__m512i)((__v8du)__a ^ (__v8du)__b);
783 }
784
785 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_xor_epi64(__m512i __src,__mmask8 __k,__m512i __a,__m512i __b)786 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
787 {
788 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
789 (__v8di)_mm512_xor_epi64(__a, __b),
790 (__v8di)__src);
791 }
792
793 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_xor_epi64(__mmask8 __k,__m512i __a,__m512i __b)794 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
795 {
796 return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
797 }
798
799 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_si512(__m512i __a,__m512i __b)800 _mm512_and_si512(__m512i __a, __m512i __b)
801 {
802 return (__m512i)((__v8du)__a & (__v8du)__b);
803 }
804
805 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_si512(__m512i __a,__m512i __b)806 _mm512_or_si512(__m512i __a, __m512i __b)
807 {
808 return (__m512i)((__v8du)__a | (__v8du)__b);
809 }
810
811 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_si512(__m512i __a,__m512i __b)812 _mm512_xor_si512(__m512i __a, __m512i __b)
813 {
814 return (__m512i)((__v8du)__a ^ (__v8du)__b);
815 }
816
817 /* Arithmetic */
818
819 static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_add_pd(__m512d __a,__m512d __b)820 _mm512_add_pd(__m512d __a, __m512d __b)
821 {
822 return (__m512d)((__v8df)__a + (__v8df)__b);
823 }
824
825 static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_add_ps(__m512 __a,__m512 __b)826 _mm512_add_ps(__m512 __a, __m512 __b)
827 {
828 return (__m512)((__v16sf)__a + (__v16sf)__b);
829 }
830
831 static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mul_pd(__m512d __a,__m512d __b)832 _mm512_mul_pd(__m512d __a, __m512d __b)
833 {
834 return (__m512d)((__v8df)__a * (__v8df)__b);
835 }
836
837 static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mul_ps(__m512 __a,__m512 __b)838 _mm512_mul_ps(__m512 __a, __m512 __b)
839 {
840 return (__m512)((__v16sf)__a * (__v16sf)__b);
841 }
842
843 static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_sub_pd(__m512d __a,__m512d __b)844 _mm512_sub_pd(__m512d __a, __m512d __b)
845 {
846 return (__m512d)((__v8df)__a - (__v8df)__b);
847 }
848
849 static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_sub_ps(__m512 __a,__m512 __b)850 _mm512_sub_ps(__m512 __a, __m512 __b)
851 {
852 return (__m512)((__v16sf)__a - (__v16sf)__b);
853 }
854
855 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_add_epi64(__m512i __A,__m512i __B)856 _mm512_add_epi64 (__m512i __A, __m512i __B)
857 {
858 return (__m512i) ((__v8du) __A + (__v8du) __B);
859 }
860
861 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_add_epi64(__m512i __W,__mmask8 __U,__m512i __A,__m512i __B)862 _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
863 {
864 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
865 (__v8di)_mm512_add_epi64(__A, __B),
866 (__v8di)__W);
867 }
868
869 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_add_epi64(__mmask8 __U,__m512i __A,__m512i __B)870 _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
871 {
872 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
873 (__v8di)_mm512_add_epi64(__A, __B),
874 (__v8di)_mm512_setzero_si512());
875 }
876
877 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sub_epi64(__m512i __A,__m512i __B)878 _mm512_sub_epi64 (__m512i __A, __m512i __B)
879 {
880 return (__m512i) ((__v8du) __A - (__v8du) __B);
881 }
882
883 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sub_epi64(__m512i __W,__mmask8 __U,__m512i __A,__m512i __B)884 _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
885 {
886 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
887 (__v8di)_mm512_sub_epi64(__A, __B),
888 (__v8di)__W);
889 }
890
891 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_epi64(__mmask8 __U,__m512i __A,__m512i __B)892 _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
893 {
894 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
895 (__v8di)_mm512_sub_epi64(__A, __B),
896 (__v8di)_mm512_setzero_si512());
897 }
898
899 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_add_epi32(__m512i __A,__m512i __B)900 _mm512_add_epi32 (__m512i __A, __m512i __B)
901 {
902 return (__m512i) ((__v16su) __A + (__v16su) __B);
903 }
904
905 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_add_epi32(__m512i __W,__mmask16 __U,__m512i __A,__m512i __B)906 _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
907 {
908 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
909 (__v16si)_mm512_add_epi32(__A, __B),
910 (__v16si)__W);
911 }
912
913 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_add_epi32(__mmask16 __U,__m512i __A,__m512i __B)914 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
915 {
916 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
917 (__v16si)_mm512_add_epi32(__A, __B),
918 (__v16si)_mm512_setzero_si512());
919 }
920
921 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sub_epi32(__m512i __A,__m512i __B)922 _mm512_sub_epi32 (__m512i __A, __m512i __B)
923 {
924 return (__m512i) ((__v16su) __A - (__v16su) __B);
925 }
926
927 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sub_epi32(__m512i __W,__mmask16 __U,__m512i __A,__m512i __B)928 _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
929 {
930 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
931 (__v16si)_mm512_sub_epi32(__A, __B),
932 (__v16si)__W);
933 }
934
935 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_epi32(__mmask16 __U,__m512i __A,__m512i __B)936 _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
937 {
938 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
939 (__v16si)_mm512_sub_epi32(__A, __B),
940 (__v16si)_mm512_setzero_si512());
941 }
942
943 #define _mm512_max_round_pd(A, B, R) \
944 ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
945 (__v8df)(__m512d)(B), (int)(R)))
946
947 #define _mm512_mask_max_round_pd(W, U, A, B, R) \
948 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
949 (__v8df)_mm512_max_round_pd((A), (B), (R)), \
950 (__v8df)(W)))
951
952 #define _mm512_maskz_max_round_pd(U, A, B, R) \
953 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
954 (__v8df)_mm512_max_round_pd((A), (B), (R)), \
955 (__v8df)_mm512_setzero_pd()))
956
957 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_max_pd(__m512d __A,__m512d __B)958 _mm512_max_pd(__m512d __A, __m512d __B)
959 {
960 return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
961 _MM_FROUND_CUR_DIRECTION);
962 }
963
964 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_max_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)965 _mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
966 {
967 return (__m512d)__builtin_ia32_selectpd_512(__U,
968 (__v8df)_mm512_max_pd(__A, __B),
969 (__v8df)__W);
970 }
971
972 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_max_pd(__mmask8 __U,__m512d __A,__m512d __B)973 _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
974 {
975 return (__m512d)__builtin_ia32_selectpd_512(__U,
976 (__v8df)_mm512_max_pd(__A, __B),
977 (__v8df)_mm512_setzero_pd());
978 }
979
980 #define _mm512_max_round_ps(A, B, R) \
981 ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
982 (__v16sf)(__m512)(B), (int)(R)))
983
984 #define _mm512_mask_max_round_ps(W, U, A, B, R) \
985 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
986 (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
987 (__v16sf)(W)))
988
989 #define _mm512_maskz_max_round_ps(U, A, B, R) \
990 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
991 (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
992 (__v16sf)_mm512_setzero_ps()))
993
994 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_max_ps(__m512 __A,__m512 __B)995 _mm512_max_ps(__m512 __A, __m512 __B)
996 {
997 return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
998 _MM_FROUND_CUR_DIRECTION);
999 }
1000
1001 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_max_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)1002 _mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
1003 {
1004 return (__m512)__builtin_ia32_selectps_512(__U,
1005 (__v16sf)_mm512_max_ps(__A, __B),
1006 (__v16sf)__W);
1007 }
1008
1009 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ps(__mmask16 __U,__m512 __A,__m512 __B)1010 _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
1011 {
1012 return (__m512)__builtin_ia32_selectps_512(__U,
1013 (__v16sf)_mm512_max_ps(__A, __B),
1014 (__v16sf)_mm512_setzero_ps());
1015 }
1016
1017 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_max_ss(__m128 __W,__mmask8 __U,__m128 __A,__m128 __B)1018 _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
1019 return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
1020 (__v4sf) __B,
1021 (__v4sf) __W,
1022 (__mmask8) __U,
1023 _MM_FROUND_CUR_DIRECTION);
1024 }
1025
1026 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_max_ss(__mmask8 __U,__m128 __A,__m128 __B)1027 _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
1028 return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
1029 (__v4sf) __B,
1030 (__v4sf) _mm_setzero_ps (),
1031 (__mmask8) __U,
1032 _MM_FROUND_CUR_DIRECTION);
1033 }
1034
1035 #define _mm_max_round_ss(A, B, R) \
1036 ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
1037 (__v4sf)(__m128)(B), \
1038 (__v4sf)_mm_setzero_ps(), \
1039 (__mmask8)-1, (int)(R)))
1040
1041 #define _mm_mask_max_round_ss(W, U, A, B, R) \
1042 ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
1043 (__v4sf)(__m128)(B), \
1044 (__v4sf)(__m128)(W), (__mmask8)(U), \
1045 (int)(R)))
1046
1047 #define _mm_maskz_max_round_ss(U, A, B, R) \
1048 ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
1049 (__v4sf)(__m128)(B), \
1050 (__v4sf)_mm_setzero_ps(), \
1051 (__mmask8)(U), (int)(R)))
1052
1053 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_max_sd(__m128d __W,__mmask8 __U,__m128d __A,__m128d __B)1054 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
1055 return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
1056 (__v2df) __B,
1057 (__v2df) __W,
1058 (__mmask8) __U,
1059 _MM_FROUND_CUR_DIRECTION);
1060 }
1061
1062 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_max_sd(__mmask8 __U,__m128d __A,__m128d __B)1063 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
1064 return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
1065 (__v2df) __B,
1066 (__v2df) _mm_setzero_pd (),
1067 (__mmask8) __U,
1068 _MM_FROUND_CUR_DIRECTION);
1069 }
1070
1071 #define _mm_max_round_sd(A, B, R) \
1072 ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
1073 (__v2df)(__m128d)(B), \
1074 (__v2df)_mm_setzero_pd(), \
1075 (__mmask8)-1, (int)(R)))
1076
1077 #define _mm_mask_max_round_sd(W, U, A, B, R) \
1078 ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
1079 (__v2df)(__m128d)(B), \
1080 (__v2df)(__m128d)(W), \
1081 (__mmask8)(U), (int)(R)))
1082
1083 #define _mm_maskz_max_round_sd(U, A, B, R) \
1084 ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
1085 (__v2df)(__m128d)(B), \
1086 (__v2df)_mm_setzero_pd(), \
1087 (__mmask8)(U), (int)(R)))
1088
1089 static __inline __m512i
1090 __DEFAULT_FN_ATTRS512
_mm512_max_epi32(__m512i __A,__m512i __B)1091 _mm512_max_epi32(__m512i __A, __m512i __B)
1092 {
1093 return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
1094 }
1095
1096 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epi32(__m512i __W,__mmask16 __M,__m512i __A,__m512i __B)1097 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1098 {
1099 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1100 (__v16si)_mm512_max_epi32(__A, __B),
1101 (__v16si)__W);
1102 }
1103
1104 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epi32(__mmask16 __M,__m512i __A,__m512i __B)1105 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
1106 {
1107 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1108 (__v16si)_mm512_max_epi32(__A, __B),
1109 (__v16si)_mm512_setzero_si512());
1110 }
1111
1112 static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu32(__m512i __A,__m512i __B)1113 _mm512_max_epu32(__m512i __A, __m512i __B)
1114 {
1115 return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
1116 }
1117
1118 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epu32(__m512i __W,__mmask16 __M,__m512i __A,__m512i __B)1119 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1120 {
1121 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1122 (__v16si)_mm512_max_epu32(__A, __B),
1123 (__v16si)__W);
1124 }
1125
1126 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epu32(__mmask16 __M,__m512i __A,__m512i __B)1127 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
1128 {
1129 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1130 (__v16si)_mm512_max_epu32(__A, __B),
1131 (__v16si)_mm512_setzero_si512());
1132 }
1133
1134 static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epi64(__m512i __A,__m512i __B)1135 _mm512_max_epi64(__m512i __A, __m512i __B)
1136 {
1137 return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
1138 }
1139
1140 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epi64(__m512i __W,__mmask8 __M,__m512i __A,__m512i __B)1141 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1142 {
1143 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1144 (__v8di)_mm512_max_epi64(__A, __B),
1145 (__v8di)__W);
1146 }
1147
1148 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epi64(__mmask8 __M,__m512i __A,__m512i __B)1149 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
1150 {
1151 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1152 (__v8di)_mm512_max_epi64(__A, __B),
1153 (__v8di)_mm512_setzero_si512());
1154 }
1155
1156 static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu64(__m512i __A,__m512i __B)1157 _mm512_max_epu64(__m512i __A, __m512i __B)
1158 {
1159 return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
1160 }
1161
1162 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epu64(__m512i __W,__mmask8 __M,__m512i __A,__m512i __B)1163 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
1164 {
1165 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1166 (__v8di)_mm512_max_epu64(__A, __B),
1167 (__v8di)__W);
1168 }
1169
1170 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epu64(__mmask8 __M,__m512i __A,__m512i __B)1171 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
1172 {
1173 return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
1174 (__v8di)_mm512_max_epu64(__A, __B),
1175 (__v8di)_mm512_setzero_si512());
1176 }
1177
1178 #define _mm512_min_round_pd(A, B, R) \
1179 ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
1180 (__v8df)(__m512d)(B), (int)(R)))
1181
1182 #define _mm512_mask_min_round_pd(W, U, A, B, R) \
1183 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
1184 (__v8df)_mm512_min_round_pd((A), (B), (R)), \
1185 (__v8df)(W)))
1186
1187 #define _mm512_maskz_min_round_pd(U, A, B, R) \
1188 ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
1189 (__v8df)_mm512_min_round_pd((A), (B), (R)), \
1190 (__v8df)_mm512_setzero_pd()))
1191
1192 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_min_pd(__m512d __A,__m512d __B)1193 _mm512_min_pd(__m512d __A, __m512d __B)
1194 {
1195 return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
1196 _MM_FROUND_CUR_DIRECTION);
1197 }
1198
1199 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_min_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)1200 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
1201 {
1202 return (__m512d)__builtin_ia32_selectpd_512(__U,
1203 (__v8df)_mm512_min_pd(__A, __B),
1204 (__v8df)__W);
1205 }
1206
1207 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_min_pd(__mmask8 __U,__m512d __A,__m512d __B)1208 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
1209 {
1210 return (__m512d)__builtin_ia32_selectpd_512(__U,
1211 (__v8df)_mm512_min_pd(__A, __B),
1212 (__v8df)_mm512_setzero_pd());
1213 }
1214
1215 #define _mm512_min_round_ps(A, B, R) \
1216 ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
1217 (__v16sf)(__m512)(B), (int)(R)))
1218
1219 #define _mm512_mask_min_round_ps(W, U, A, B, R) \
1220 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
1221 (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
1222 (__v16sf)(W)))
1223
1224 #define _mm512_maskz_min_round_ps(U, A, B, R) \
1225 ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
1226 (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
1227 (__v16sf)_mm512_setzero_ps()))
1228
1229 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_min_ps(__m512 __A,__m512 __B)1230 _mm512_min_ps(__m512 __A, __m512 __B)
1231 {
1232 return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
1233 _MM_FROUND_CUR_DIRECTION);
1234 }
1235
1236 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_min_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)1237 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
1238 {
1239 return (__m512)__builtin_ia32_selectps_512(__U,
1240 (__v16sf)_mm512_min_ps(__A, __B),
1241 (__v16sf)__W);
1242 }
1243
1244 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ps(__mmask16 __U,__m512 __A,__m512 __B)1245 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
1246 {
1247 return (__m512)__builtin_ia32_selectps_512(__U,
1248 (__v16sf)_mm512_min_ps(__A, __B),
1249 (__v16sf)_mm512_setzero_ps());
1250 }
1251
1252 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_min_ss(__m128 __W,__mmask8 __U,__m128 __A,__m128 __B)1253 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
1254 return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
1255 (__v4sf) __B,
1256 (__v4sf) __W,
1257 (__mmask8) __U,
1258 _MM_FROUND_CUR_DIRECTION);
1259 }
1260
1261 static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_min_ss(__mmask8 __U,__m128 __A,__m128 __B)1262 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
1263 return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
1264 (__v4sf) __B,
1265 (__v4sf) _mm_setzero_ps (),
1266 (__mmask8) __U,
1267 _MM_FROUND_CUR_DIRECTION);
1268 }
1269
1270 #define _mm_min_round_ss(A, B, R) \
1271 ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
1272 (__v4sf)(__m128)(B), \
1273 (__v4sf)_mm_setzero_ps(), \
1274 (__mmask8)-1, (int)(R)))
1275
1276 #define _mm_mask_min_round_ss(W, U, A, B, R) \
1277 ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
1278 (__v4sf)(__m128)(B), \
1279 (__v4sf)(__m128)(W), (__mmask8)(U), \
1280 (int)(R)))
1281
1282 #define _mm_maskz_min_round_ss(U, A, B, R) \
1283 ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
1284 (__v4sf)(__m128)(B), \
1285 (__v4sf)_mm_setzero_ps(), \
1286 (__mmask8)(U), (int)(R)))
1287
1288 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_min_sd(__m128d __W,__mmask8 __U,__m128d __A,__m128d __B)1289 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
1290 return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
1291 (__v2df) __B,
1292 (__v2df) __W,
1293 (__mmask8) __U,
1294 _MM_FROUND_CUR_DIRECTION);
1295 }
1296
1297 static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_min_sd(__mmask8 __U,__m128d __A,__m128d __B)1298 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
1299 return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
1300 (__v2df) __B,
1301 (__v2df) _mm_setzero_pd (),
1302 (__mmask8) __U,
1303 _MM_FROUND_CUR_DIRECTION);
1304 }
1305
1306 #define _mm_min_round_sd(A, B, R) \
1307 ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
1308 (__v2df)(__m128d)(B), \
1309 (__v2df)_mm_setzero_pd(), \
1310 (__mmask8)-1, (int)(R)))
1311
1312 #define _mm_mask_min_round_sd(W, U, A, B, R) \
1313 ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
1314 (__v2df)(__m128d)(B), \
1315 (__v2df)(__m128d)(W), \
1316 (__mmask8)(U), (int)(R)))
1317
1318 #define _mm_maskz_min_round_sd(U, A, B, R) \
1319 ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
1320 (__v2df)(__m128d)(B), \
1321 (__v2df)_mm_setzero_pd(), \
1322 (__mmask8)(U), (int)(R)))
1323
1324 static __inline __m512i
1325 __DEFAULT_FN_ATTRS512
_mm512_min_epi32(__m512i __A,__m512i __B)1326 _mm512_min_epi32(__m512i __A, __m512i __B)
1327 {
1328 return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
1329 }
1330
1331 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epi32(__m512i __W,__mmask16 __M,__m512i __A,__m512i __B)1332 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
1333 {
1334 return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
1335 (__v16si)_mm512_min_epi32(__A, __B),
1336 (__v16si)__W);
1337 }
1338
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_min_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_min_epu32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_min_epu32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epu64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epu64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

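/* PMULDQ/PMULUDQ: multiply the 32-bit integers held in the low half of each
 * 64-bit lane and produce full 64-bit products -- signed for the epi32 form,
 * unsigned for the epu32 form. The odd 32-bit elements are ignored. */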
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mul_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si)__Y);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epi32(__X, __Y),
                                             (__v8di)__W);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epi32(__X, __Y),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mul_epu32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epu32(__X, __Y),
                                             (__v8di)__W);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epu32(__X, __Y),
                                             (__v8di)_mm512_setzero_si512());
}

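/* PMULLD: full 32x32 multiply keeping the low 32 bits of each product. The
 * unmasked form is written as a plain vector multiply so the compiler can
 * fold and optimize it like ordinary C arithmetic. */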
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mullo_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A * (__v16su) __B);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_mullo_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_mullo_epi32(__A, __B),
                                             (__v16si)__W);
}

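/* Low 64 bits of a 64x64 multiply. Unlike _mm512_mullo_epi64 (which requires
 * AVX512DQ), _mm512_mullox_epi64 needs only AVX512F: it is lowered from plain
 * 64-bit vector arithmetic rather than a dedicated instruction. */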
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mullox_epi64 (__m512i __A, __m512i __B) {
  return (__m512i) ((__v8du) __A * (__v8du) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_mullox_epi64(__A, __B),
                                             (__v8di)__W);
}

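/* The *_round_* forms take a compile-time rounding control R: one of the
 * _MM_FROUND_TO_* modes (optionally OR'ed with _MM_FROUND_NO_EXC) for static
 * rounding, or _MM_FROUND_CUR_DIRECTION to use the current MXCSR setting.
 * They are macros because R must be an immediate. */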
#define _mm512_sqrt_round_pd(A, R) \
  ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))

#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_sqrt_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_sqrt_pd(__m512d __A)
{
  return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_sqrt_pd(__A),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_sqrt_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

#define _mm512_sqrt_round_ps(A, R) \
  ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_sqrt_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_sqrt_ps(__m512 __A)
{
  return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_sqrt_ps(__A),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ps(__mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_sqrt_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}

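/* VRSQRT14: fast approximate 1/sqrt(x) with a maximum relative error of
 * 2^-14. Pair with a Newton-Raphson step where full precision is needed. */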
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_rsqrt14_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                                                     (__v8df) _mm512_setzero_pd (),
                                                     (__mmask8) -1);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                                                     (__v8df) __W,
                                                     (__mmask8) __U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                                                     (__v8df) _mm512_setzero_pd (),
                                                     (__mmask8) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_rsqrt14_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                                                    (__v16sf) _mm512_setzero_ps (),
                                                    (__mmask16) -1);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                                                    (__v16sf) __W,
                                                    (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                                                    (__v16sf) _mm512_setzero_ps (),
                                                    (__mmask16) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_rsqrt14_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
                                                 (__v4sf) __B,
                                                 (__v4sf) _mm_setzero_ps (),
                                                 (__mmask8) -1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
                                                 (__v4sf) __B,
                                                 (__v4sf) __W,
                                                 (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
                                                 (__v4sf) __B,
                                                 (__v4sf) _mm_setzero_ps (),
                                                 (__mmask8) __U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_rsqrt14_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
                                                  (__v2df) __B,
                                                  (__v2df) _mm_setzero_pd (),
                                                  (__mmask8) -1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
                                                  (__v2df) __B,
                                                  (__v2df) __W,
                                                  (__mmask8) __U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
                                                  (__v2df) __B,
                                                  (__v2df) _mm_setzero_pd (),
                                                  (__mmask8) __U);
}

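/* VRCP14: fast approximate 1/x, also with a maximum relative error of 2^-14. */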
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_rcp14_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
                                                   (__v8df) _mm512_setzero_pd (),
                                                   (__mmask8) -1);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
                                                   (__v8df) __W,
                                                   (__mmask8) __U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
                                                   (__v8df) _mm512_setzero_pd (),
                                                   (__mmask8) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_rcp14_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
                                                  (__v16sf) _mm512_setzero_ps (),
                                                  (__mmask16) -1);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
                                                  (__v16sf) __W,
                                                  (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
                                                  (__v16sf) _mm512_setzero_ps (),
                                                  (__mmask16) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_rcp14_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
                                               (__v4sf) __B,
                                               (__v4sf) _mm_setzero_ps (),
                                               (__mmask8) -1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
                                               (__v4sf) __B,
                                               (__v4sf) __W,
                                               (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
                                               (__v4sf) __B,
                                               (__v4sf) _mm_setzero_ps (),
                                               (__mmask8) __U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_rcp14_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
                                                (__v2df) __B,
                                                (__v2df) _mm_setzero_pd (),
                                                (__mmask8) -1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
                                                (__v2df) __B,
                                                (__v2df) __W,
                                                (__mmask8) __U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
                                                (__v2df) __B,
                                                (__v2df) _mm_setzero_pd (),
                                                (__mmask8) __U);
}

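/* floor/ceil are implemented with VRNDSCALE using the _MM_FROUND_FLOOR and
 * _MM_FROUND_CEIL controls; the input doubles as the (fully masked)
 * passthrough operand in the unmasked forms. */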
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_floor_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                  _MM_FROUND_FLOOR,
                                                  (__v16sf) __A, (unsigned short)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                  _MM_FROUND_FLOOR,
                                                  (__v16sf) __W, __U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_floor_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                   _MM_FROUND_FLOOR,
                                                   (__v8df) __A, (unsigned char)-1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                   _MM_FROUND_FLOOR,
                                                   (__v8df) __W, __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                  _MM_FROUND_CEIL,
                                                  (__v16sf) __W, __U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_ceil_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
                                                  _MM_FROUND_CEIL,
                                                  (__v16sf) __A, (unsigned short)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_ceil_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                   _MM_FROUND_CEIL,
                                                   (__v8df) __A, (unsigned char)-1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
                                                   _MM_FROUND_CEIL,
                                                   (__v8df) __W, __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

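/* Element-wise absolute value; clang lowers __builtin_elementwise_abs on
 * these vector types to VPABSQ/VPABSD. */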
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_abs_epi64(__m512i __A)
{
  return (__m512i)__builtin_elementwise_abs((__v8di)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_abs_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_abs_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_abs_epi32(__m512i __A)
{
  return (__m512i)__builtin_elementwise_abs((__v16si) __A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_abs_epi32(__A),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_abs_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

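/* Masked scalar math: the operation runs on the low element, a scalar select
 * merges it against the passthrough (or zero), and the upper elements are
 * copied from __A. Illustrative use of the merge-masked form:
 *   __m128 r = _mm_mask_add_ss(w, m, a, b);
 *   r[0] = (m & 1) ? a[0] + b[0] : w[0], while r[1..3] = a[1..3].
 */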
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  __A = _mm_add_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, __W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) {
  __A = _mm_add_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
}

#define _mm_add_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_add_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  __A = _mm_add_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, __W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) {
  __A = _mm_add_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
}

#define _mm_add_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_add_pd(__A, __B),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_add_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_add_ps(__A, __B),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_add_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

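/* Packed add with an explicit (static) rounding mode, e.g.
 *   _mm512_add_round_pd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */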
#define _mm512_add_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_add_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_add_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_add_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_add_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_add_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  __A = _mm_sub_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, __W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) {
  __A = _mm_sub_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
}

#define _mm_sub_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_sub_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  __A = _mm_sub_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, __W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) {
  __A = _mm_sub_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
}

#define _mm_sub_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_sub_pd(__A, __B),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_sub_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_sub_ps(__A, __B),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_sub_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

#define _mm512_sub_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_sub_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_sub_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_sub_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  __A = _mm_mul_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, __W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) {
  __A = _mm_mul_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
}

#define _mm_mul_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_mul_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  __A = _mm_mul_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, __W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) {
  __A = _mm_mul_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
}

#define _mm_mul_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_mul_pd(__A, __B),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_mul_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_mul_ps(__A, __B),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_mul_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

#define _mm512_mul_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_mul_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_mul_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_mul_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  __A = _mm_div_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, __W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) {
  __A = _mm_div_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
}

#define _mm_div_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_div_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  __A = _mm_div_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, __W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) {
  __A = _mm_div_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
}

#define _mm_div_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

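/* Packed division, expressed as plain vector arithmetic so the compiler can
 * fold and optimize it like ordinary C arithmetic. */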
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_div_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a/(__v8df)__b);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_div_pd(__A, __B),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_div_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_div_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a/(__v16sf)__b);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_div_ps(__A, __B),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_div_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

#define _mm512_div_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_div_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_div_round_pd((A), (B), (R)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_div_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_div_round_pd((A), (B), (R)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_div_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_div_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_div_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
                                       (__v16sf)_mm512_setzero_ps()))

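/* VRNDSCALE: rounds each element to 2^-M precision, i.e. to the nearest
 * multiple of 2^-M, where M is the high nibble of the 8-bit immediate; the
 * low nibble selects the rounding mode and exception behavior. With an
 * immediate of 0 this is plain round-to-integer. */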
#define _mm512_roundscale_ps(A, B) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
                                          (__v16sf)_mm512_undefined_ps(), \
                                          (__mmask16)-1, \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ps(A, B, C, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ps(A, B, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(A), \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                          (__v16sf)(__m512)(A), (__mmask16)(B), \
                                          (int)(R)))

#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(A), (int)(R)))

#define _mm512_roundscale_round_ps(A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
                                          (__v16sf)_mm512_undefined_ps(), \
                                          (__mmask16)-1, (int)(R)))

#define _mm512_roundscale_pd(A, B) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_pd(A, B, C, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_pd(A, B, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(A), \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                           (__v8df)(__m512d)(A), (__mmask8)(B), \
                                           (int)(R)))

#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(A), (int)(R)))

#define _mm512_roundscale_round_pd(A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, (int)(R)))

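/* FMA forms: fmadd = A*B + C, fmsub = A*B - C, fnmadd = -(A*B) + C and
 * fnmsub = -(A*B) - C are all built on the one vfmadd builtin by negating
 * operands. Masking convention: mask merges into the first source (A),
 * mask3 merges into the addend (C), and maskz zeroes inactive lanes. */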
#define _mm512_fmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_fmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_fnmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_fnmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    (__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    (__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    (__v8df) __B,
                                                    -(__v8df) __C,
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    (__v8df) __B,
                                                    -(__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
                                                     (__v8df) __B,
                                                     -(__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    -(__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    -(__v8df) __B,
                                                    -(__v8df) __C,
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
                                                     (__v8df) __B,
                                                     -(__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_fmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_fnmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_fnmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   -(__v16sf) __C,
                                                   (__mmask16) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   -(__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
                                                    (__v16sf) __B,
                                                    -(__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   -(__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   -(__v16sf) __B,
                                                   -(__v16sf) __C,
                                                   (__mmask16) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
                                                    (__v16sf) __B,
                                                    -(__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

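/* fmaddsub alternates subtract/add across lanes (even-indexed elements
 * compute A*B - C, odd-indexed elements A*B + C); fmsubadd is the opposite
 * pairing. */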
#define _mm512_fmaddsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
                                               (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

#define _mm512_fmsubadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
                                               (__mmask8)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
                                               (__mmask8)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                -(__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
                                                       (__v8df) __B,
                                                       (__v8df) __C,
                                                       (__mmask8) -1,
                                                       _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
                                                       (__v8df) __B,
                                                       (__v8df) __C,
                                                       (__mmask8) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
                                                        (__v8df) __B,
                                                        (__v8df) __C,
                                                        (__mmask8) __U,
                                                        _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
                                                        (__v8df) __B,
                                                        (__v8df) __C,
                                                        (__mmask8) __U,
                                                        _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
                                                       (__v8df) __B,
                                                       -(__v8df) __C,
                                                       (__mmask8) -1,
                                                       _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
                                                       (__v8df) __B,
                                                       -(__v8df) __C,
                                                       (__mmask8) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
                                                        (__v8df) __B,
                                                        -(__v8df) __C,
                                                        (__mmask8) __U,
                                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_fmaddsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_fmsubadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               -(__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
                                                      (__v16sf) __B,
                                                      (__v16sf) __C,
                                                      (__mmask16) -1,
3014 _MM_FROUND_CUR_DIRECTION);
3015 }
3016
3017 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ps(__m512 __A,__mmask16 __U,__m512 __B,__m512 __C)3018 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3019 {
3020 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3021 (__v16sf) __B,
3022 (__v16sf) __C,
3023 (__mmask16) __U,
3024 _MM_FROUND_CUR_DIRECTION);
3025 }
3026
3027 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ps(__m512 __A,__m512 __B,__m512 __C,__mmask16 __U)3028 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
3029 {
3030 return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
3031 (__v16sf) __B,
3032 (__v16sf) __C,
3033 (__mmask16) __U,
3034 _MM_FROUND_CUR_DIRECTION);
3035 }
3036
3037 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ps(__mmask16 __U,__m512 __A,__m512 __B,__m512 __C)3038 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
3039 {
3040 return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
3041 (__v16sf) __B,
3042 (__v16sf) __C,
3043 (__mmask16) __U,
3044 _MM_FROUND_CUR_DIRECTION);
3045 }
3046
3047 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ps(__m512 __A,__m512 __B,__m512 __C)3048 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
3049 {
3050 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3051 (__v16sf) __B,
3052 -(__v16sf) __C,
3053 (__mmask16) -1,
3054 _MM_FROUND_CUR_DIRECTION);
3055 }
3056
3057 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ps(__m512 __A,__mmask16 __U,__m512 __B,__m512 __C)3058 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
3059 {
3060 return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
3061 (__v16sf) __B,
3062 -(__v16sf) __C,
3063 (__mmask16) __U,
3064 _MM_FROUND_CUR_DIRECTION);
3065 }
3066
3067 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ps(__mmask16 __U,__m512 __A,__m512 __B,__m512 __C)3068 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
3069 {
3070 return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
3071 (__v16sf) __B,
3072 -(__v16sf) __C,
3073 (__mmask16) __U,
3074 _MM_FROUND_CUR_DIRECTION);
3075 }
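
/* Usage sketch (illustrative, not part of this header): fmaddsub subtracts
 * C in even lanes and adds C in odd lanes (fmsubadd is the reverse), which
 * is the building block for interleaved complex arithmetic. A hypothetical
 * helper for multiplying (re,im) pairs of doubles:
 *
 *   static inline __m512d complex_mul(__m512d a, __m512d b) {
 *     __m512d b_re = _mm512_movedup_pd(b);        // broadcast even lanes
 *     __m512d b_im = _mm512_unpackhi_pd(b, b);    // broadcast odd lanes
 *     __m512d a_sw = _mm512_permute_pd(a, 0x55);  // swap re/im within pairs
 *     // even: a.re*b.re - a.im*b.im,  odd: a.im*b.re + a.re*b.im
 *     return _mm512_fmaddsub_pd(a, b_re, _mm512_mul_pd(a_sw, b_im));
 *   }
 */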

#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
                                                    (__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
                                                       (__v8df) __B,
                                                       (__v8df) __C,
                                                       (__mmask8) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
                                                      (__v16sf) __B,
                                                      (__v16sf) __C,
                                                      (__mmask16) __U,
                                                      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            -(__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    -(__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   -(__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            -(__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    -(__v8df) __B,
                                                    -(__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   -(__v16sf) __B,
                                                   -(__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}
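
/* Sign conventions for the fused-multiply forms above (illustrative):
 *
 *   fmadd:   A*B + C      fmsub:   A*B - C
 *   fnmadd: -(A*B) + C    fnmsub: -(A*B) - C
 *
 * e.g. a hypothetical Newton-Raphson reciprocal refinement built on fnmadd:
 *
 *   // x1 = x0 * (2 - a*x0); the (2 - a*x0) term is fnmadd(a, x0, 2).
 *   static inline __m512 recip_refine(__m512 a, __m512 x0) {
 *     const __m512 two = _mm512_set1_ps(2.0f);
 *     return _mm512_mul_ps(x0, _mm512_fnmadd_ps(a, x0, two));
 *   }
 */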

/* Vector permutations */

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
                                                (__v16si) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
                               __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                              (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
                                __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                              (__v16si)__I);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
                                __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                              (__v16si)_mm512_setzero_si512());
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
                                                (__v8di) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
                               __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                               (__v8di)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
                                __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                               (__v8di)__I);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
                                __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                               (__v8di)_mm512_setzero_si512());
}
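
/* Usage sketch (illustrative): permutex2var picks each destination lane from
 * the concatenation of two sources. For epi32, index bits [3:0] select the
 * lane and bit 4 selects between __A (0) and __B (1). A hypothetical helper
 * that extracts dwords 8..23 of the 1024-bit concatenation b:a, i.e. a
 * cross-vector shift by eight dwords:
 *
 *   static inline __m512i concat_shift8(__m512i a, __m512i b) {
 *     const __m512i idx = _mm512_set_epi32(23, 22, 21, 20, 19, 18, 17, 16,
 *                                          15, 14, 13, 12, 11, 10, 9, 8);
 *     return _mm512_permutex2var_epi32(a, idx, b);  // { a[8..15], b[0..7] }
 *   }
 */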

#define _mm512_alignr_epi64(A, B, I) \
  ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
                                     (__v8di)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
                                 (__v8di)(__m512i)(W)))

#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                 (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
                                 (__v8di)_mm512_setzero_si512()))

#define _mm512_alignr_epi32(A, B, I) \
  ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
                                     (__v16si)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
                                (__v16si)(__m512i)(W)))

#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
                                (__v16si)_mm512_setzero_si512()))
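
/* Usage sketch (illustrative): valignq/valignd shift the concatenation A:B
 * right by whole elements, so _mm512_alignr_epi64(A, B, 1) yields
 * { B[1..7], A[0] }. With both operands the same vector this is a rotate:
 *
 *   static inline __m512i rotate_right_1_epi64(__m512i v) {
 *     return _mm512_alignr_epi64(v, v, 1);   // { v[1..7], v[0] }
 *   }
 */
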
/* Vector Extract */

#define _mm512_extractf64x4_pd(A, I) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
                                             (__v4df)_mm256_undefined_pd(), \
                                             (__mmask8)-1))

#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)(__m256d)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)_mm256_setzero_pd(), \
                                             (__mmask8)(U)))

#define _mm512_extractf32x4_ps(A, I) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
                                            (__v4sf)_mm_undefined_ps(), \
                                            (__mmask8)-1))

#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)(__m128)(W), \
                                            (__mmask8)(U)))

#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)_mm_setzero_ps(), \
                                            (__mmask8)(U)))
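
/* Usage sketch (illustrative): the extract macros pull one 256-bit or
 * 128-bit slice out of a 512-bit register; the immediate selects the slice:
 *
 *   __m256d hi4 = _mm512_extractf64x4_pd(v, 1);   // doubles 4..7
 *   __m128  q2  = _mm512_extractf32x4_ps(w, 2);   // floats 8..11
 */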

/* Vector Blend */

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
                                                (__v8df) __W,
                                                (__v8df) __A);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
                                               (__v16sf) __W,
                                               (__v16sf) __A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
                                               (__v8di) __W,
                                               (__v8di) __A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
                                               (__v16si) __W,
                                               (__v16si) __A);
}
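
/* Usage sketch (illustrative): blend returns __W where the mask bit is 1
 * and __A where it is 0, e.g. a branch-free clamp of negative lanes to zero
 * (hypothetical helper; _mm512_cmplt_ps_mask is defined just below):
 *
 *   static inline __m512 clamp_negative_to_zero(__m512 v) {
 *     __mmask16 neg = _mm512_cmplt_ps_mask(v, _mm512_setzero_ps());
 *     return _mm512_mask_blend_ps(neg, v, _mm512_setzero_ps());
 *   }
 */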

/* Compare */

#define _mm512_cmp_round_ps_mask(A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_cmp_ps_mask(A, B, P) \
  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_cmpeq_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)

#define _mm512_cmp_round_pd_mask(A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(B), (int)(P), \
                                          (__mmask8)-1, (int)(R)))

#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(B), (int)(P), \
                                          (__mmask8)(U), (int)(R)))

#define _mm512_cmp_pd_mask(A, B, P) \
  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_cmpeq_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
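
/* Usage sketch (illustrative): the comparison macros return a bitmask with
 * bit i set when lane i satisfies the predicate; masks compose with plain
 * integer ops, e.g. counting the lanes inside a closed range:
 *
 *   static inline int count_in_range(__m512 v, __m512 lo, __m512 hi) {
 *     __mmask16 ge = _mm512_cmp_ps_mask(v, lo, _CMP_GE_OS);
 *     __mmask16 le = _mm512_cmp_ps_mask(v, hi, _CMP_LE_OS);
 *     return __builtin_popcount((unsigned)(ge & le));
 *   }
 */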

/* Conversion */

#define _mm512_cvtt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_undefined_epi32(), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)(__m512i)(W), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)_mm512_setzero_si512(), \
                                              (__mmask16)(U), (int)(R)))

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epu32(__m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
                                               (__v16si) _mm512_setzero_si512(),
                                                      (__mmask16) -1,
                                                      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttps_epu32(__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
                                                      (__v16si) __W,
                                                      (__mmask16) __U,
                                                      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttps_epu32(__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
                                               (__v16si) _mm512_setzero_si512(),
                                                      (__mmask16) __U,
                                                      _MM_FROUND_CUR_DIRECTION);
}
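
/* Usage sketch (illustrative): the "tt" conversions truncate toward zero,
 * and the maskz forms zero lanes that are masked out, e.g. forcing lanes
 * that were negative to 0 instead of the integer-indefinite result
 * (hypothetical helper; NaN lanes also pass the NLT predicate, so this is
 * a sketch rather than a fully defensive routine):
 *
 *   static inline __m512i to_u32_clamped_at_zero(__m512 v) {
 *     __mmask16 nonneg = _mm512_cmpnlt_ps_mask(v, _mm512_setzero_ps());
 *     return _mm512_maskz_cvttps_epu32(nonneg, v);
 *   }
 */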

#define _mm512_cvt_roundepi32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_cvt_roundepu32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ps(__m512i __A)
{
  return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ps(__m512 __W, __mmask16 __U, __m512i __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_cvtepu32_ps(__A),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ps(__mmask16 __U, __m512i __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_cvtepu32_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_pd(__m256i __A)
{
  return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_pd(__m512d __W, __mmask8 __U, __m256i __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                              (__v8df)_mm512_cvtepi32_pd(__A),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_pd(__mmask8 __U, __m256i __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                              (__v8df)_mm512_cvtepi32_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepi32lo_pd(__m512i __A)
{
  return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U, __m512i __A)
{
  return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U,
                                           _mm512_castsi512_si256(__A));
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ps(__m512i __A)
{
  return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ps(__m512 __W, __mmask16 __U, __m512i __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_cvtepi32_ps(__A),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ps(__mmask16 __U, __m512i __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_cvtepi32_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_pd(__m256i __A)
{
  return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_pd(__m512d __W, __mmask8 __U, __m256i __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                              (__v8df)_mm512_cvtepu32_pd(__A),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_pd(__mmask8 __U, __m256i __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                              (__v8df)_mm512_cvtepu32_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepu32lo_pd(__m512i __A)
{
  return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U, __m512i __A)
{
  return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U,
                                           _mm512_castsi512_si256(__A));
}
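
/* Usage sketch (illustrative): the *lo_pd forms widen only the low eight
 * 32-bit lanes of a 512-bit integer vector to doubles:
 *
 *   __m512i v32 = _mm512_loadu_si512(p);        // 16 x i32
 *   __m512d lo  = _mm512_cvtepi32lo_pd(v32);    // doubles from lanes 0..7
 */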

#define _mm512_cvt_roundpd_ps(A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_cvtpd_ps(__m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                                                  (__v8sf) _mm256_undefined_ps(),
                                                   (__mmask8) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ps(__m256 __W, __mmask8 __U, __m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                                                   (__v8sf) __W,
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ps(__mmask8 __U, __m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                                                   (__v8sf) _mm256_setzero_ps(),
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtpd_pslo(__m512d __A)
{
  return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
                                          (__v8sf) _mm256_setzero_ps(),
                                          0, 1, 2, 3, 4, 5, 6, 7,
                                          8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_pslo(__m512 __W, __mmask8 __U, __m512d __A)
{
  return (__m512) __builtin_shufflevector(
                        (__v8sf) _mm512_mask_cvtpd_ps(
                                     _mm512_castps512_ps256(__W), __U, __A),
                        (__v8sf) _mm256_setzero_ps(),
                        0, 1, 2, 3, 4, 5, 6, 7,
                        8, 9, 10, 11, 12, 13, 14, 15);
}

#define _mm512_cvt_roundps_ph(A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                            (__v16hi)_mm256_undefined_si256(), \
                                             (__mmask16)-1))

#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)(__m256i)(U), \
                                             (__mmask16)(W)))

#define _mm512_maskz_cvt_roundps_ph(W, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_setzero_si256(), \
                                             (__mmask16)(W)))

#define _mm512_cvtps_ph       _mm512_cvt_roundps_ph
#define _mm512_mask_cvtps_ph  _mm512_mask_cvt_roundps_ph
#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph

#define _mm512_cvt_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtph_ps(__m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                                                   (__v16sf) _mm512_setzero_ps(),
                                                    (__mmask16) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_ps(__m512 __W, __mmask16 __U, __m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                                                    (__v16sf) __W,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_ps(__mmask16 __U, __m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                                                   (__v16sf) _mm512_setzero_ps(),
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}
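
/* Usage sketch (illustrative): a float -> half -> float round trip; the
 * cvtps_ph immediate is the rounding control for the narrowing step, while
 * the widening conversion is exact:
 *
 *   __m256i h = _mm512_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT);
 *   __m512  f = _mm512_cvtph_ps(h);
 */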

#define _mm512_cvtt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)(__m256i)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)(U), (int)(R)))

static __inline __m256i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epi32(__m512d __a)
{
  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
                                              (__v8si)_mm256_setzero_si256(),
                                                   (__mmask8) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttpd_epi32(__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                                                     (__v8si) __W,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttpd_epi32(__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                                               (__v8si) _mm256_setzero_si256(),
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvtt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)(__m512i)(W), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)(U), (int)(R)))

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epi32(__m512 __a)
{
  return (__m512i)
    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
                                     (__v16si) _mm512_setzero_si512 (),
                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttps_epi32(__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                                                     (__v16si) __W,
                                                     (__mmask16) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttps_epi32(__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                                               (__v16si) _mm512_setzero_si512(),
                                                     (__mmask16) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)(__m512i)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epi32(__m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                                              (__v16si) _mm512_undefined_epi32(),
                                                    (__mmask16) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_epi32(__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                                                    (__v16si) __W,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_epi32(__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                                               (__v16si) _mm512_setzero_si512(),
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)(__m256i)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epi32(__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                                               (__v8si) _mm256_undefined_si256(),
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_epi32(__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                                                    (__v8si) __W,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_epi32(__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                                                (__v8si) _mm256_setzero_si256(),
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)(__m512i)(W), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epu32(__m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                                              (__v16si) _mm512_undefined_epi32(),
                                                     (__mmask16) -1,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_epu32(__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                                                     (__v16si) __W,
                                                     (__mmask16) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_epu32(__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
                                               (__v16si) _mm512_setzero_si512(),
                                                     (__mmask16) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_cvt_roundpd_epu32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)(__m256i)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)(U), (int)(R)))

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epu32(__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                                               (__v8si) _mm256_undefined_si256(),
                                                     (__mmask8) -1,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_epu32(__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                                                     (__v8si) __W,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_epu32(__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                                                (__v8si) _mm256_setzero_si256(),
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_cvtsd_f64(__m512d __a)
{
  return __a[0];
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_cvtss_f32(__m512 __a)
{
  return __a[0];
}

/* Unpack and Interleave */

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_unpackhi_pd(__m512d __a, __m512d __b)
{
  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                          (__v8df)_mm512_unpackhi_pd(__A, __B),
                                          (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                          (__v8df)_mm512_unpackhi_pd(__A, __B),
                                          (__v8df)_mm512_setzero_pd());
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_unpacklo_pd(__m512d __a, __m512d __b)
{
  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                          (__v8df)_mm512_unpacklo_pd(__A, __B),
                                          (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_pd(__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                          (__v8df)_mm512_unpacklo_pd(__A, __B),
                                          (__v8df)_mm512_setzero_pd());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_unpackhi_ps(__m512 __a, __m512 __b)
{
  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
                                         2,    18,    3,    19,
                                         2+4,  18+4,  3+4,  19+4,
                                         2+8,  18+8,  3+8,  19+8,
                                         2+12, 18+12, 3+12, 19+12);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                         (__v16sf)_mm512_unpackhi_ps(__A, __B),
                                         (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_ps(__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                         (__v16sf)_mm512_unpackhi_ps(__A, __B),
                                         (__v16sf)_mm512_setzero_ps());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_unpacklo_ps(__m512 __a, __m512 __b)
{
  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
                                         0,    16,    1,    17,
                                         0+4,  16+4,  1+4,  17+4,
                                         0+8,  16+8,  1+8,  17+8,
                                         0+12, 16+12, 1+12, 17+12);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                         (__v16sf)_mm512_unpacklo_ps(__A, __B),
                                         (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_ps(__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                         (__v16sf)_mm512_unpacklo_ps(__A, __B),
                                         (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
                                          2,    18,    3,    19,
                                          2+4,  18+4,  3+4,  19+4,
                                          2+8,  18+8,  3+8,  19+8,
                                          2+12, 18+12, 3+12, 19+12);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                      (__v16si)_mm512_unpackhi_epi32(__A, __B),
                                      (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                      (__v16si)_mm512_unpackhi_epi32(__A, __B),
                                      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
                                          0,    16,    1,    17,
                                          0+4,  16+4,  1+4,  17+4,
                                          0+8,  16+8,  1+8,  17+8,
                                          0+12, 16+12, 1+12, 17+12);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                      (__v16si)_mm512_unpacklo_epi32(__A, __B),
                                      (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                      (__v16si)_mm512_unpacklo_epi32(__A, __B),
                                      (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                       (__v8di)_mm512_unpackhi_epi64(__A, __B),
                                       (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                       (__v8di)_mm512_unpackhi_epi64(__A, __B),
                                       (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpacklo_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                       (__v8di)_mm512_unpacklo_epi64(__A, __B),
                                       (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                       (__v8di)_mm512_unpacklo_epi64(__A, __B),
                                       (__v8di)_mm512_setzero_si512());
}
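
/* Usage sketch (illustrative): the unpack ops interleave within each 128-bit
 * lane, not across the whole register. For epi32, with a = {a0..a15} and
 * b = {b0..b15}:
 *
 *   unpacklo -> {a0,b0,a1,b1,  a4,b4,a5,b5,  a8,b8,a9,b9,  a12,b12,a13,b13}
 *   unpackhi -> {a2,b2,a3,b3,  a6,b6,a7,b7,  a10,b10,a11,b11, a14,b14,a15,b15}
 */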

/* SIMD load ops */

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_si512(void const *__P)
{
  struct __loadu_si512 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_si512*)__P)->__v;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi32(void const *__P)
{
  struct __loadu_epi32 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_epi32*)__P)->__v;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
                                                     (__v16si) __W,
                                                     (__mmask16) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
                                               (__v16si) _mm512_setzero_si512(),
                                                     (__mmask16) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi64(void const *__P)
{
  struct __loadu_epi64 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_epi64*)__P)->__v;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
                                                     (__v8di) __W,
                                                     (__mmask8) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
                                                (__v8di) _mm512_setzero_si512(),
                                                     (__mmask8) __U);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_ps(__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
                                                  (__v16sf) __W,
                                                  (__mmask16) __U);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
                                                  (__v16sf) _mm512_setzero_ps(),
                                                  (__mmask16) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_pd(__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
                                                   (__v8df) __W,
                                                   (__mmask8) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
                                                   (__v8df) _mm512_setzero_pd(),
                                                   (__mmask8) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_loadu_pd(void const *__p)
{
  struct __loadu_pd {
    __m512d_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pd*)__p)->__v;
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_loadu_ps(void const *__p)
{
  struct __loadu_ps {
    __m512_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_load_ps(void const *__p)
{
  return *(const __m512*)__p;
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_load_ps(__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
                                                  (__v16sf) __W,
                                                  (__mmask16) __U);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
                                                  (__v16sf) _mm512_setzero_ps(),
                                                  (__mmask16) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_load_pd(void const *__p)
{
  return *(const __m512d*)__p;
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_load_pd(__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
                                                   (__v8df) __W,
                                                   (__mmask8) __U);
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
                                                   (__v8df) _mm512_setzero_pd(),
                                                   (__mmask8) __U);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_si512(void const *__P)
{
  return *(const __m512i *) __P;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_epi32(void const *__P)
{
  return *(const __m512i *) __P;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_epi64(void const *__P)
{
  return *(const __m512i *) __P;
}
4483
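/// Example (illustrative sketch): the loadu forms above tolerate any pointer
/// alignment, and the mask/maskz forms either merge unloaded lanes from a
/// fallback vector or zero them. The buffer \c buf below is a hypothetical
/// caller-supplied array of at least 16 ints.
/// \code
///   int buf[16] = {0};
///   __m512i fallback = _mm512_set1_epi32(-1);
///   /* Lanes 0..7 come from buf; lanes 8..15 keep fallback's values. */
///   __m512i merged = _mm512_mask_loadu_epi32(fallback, (__mmask16)0x00FF, buf);
///   /* Lanes 0..7 come from buf; lanes 8..15 are zeroed. */
///   __m512i zeroed = _mm512_maskz_loadu_epi32((__mmask16)0x00FF, buf);
/// \endcode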
/* SIMD store ops */

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi64 (void *__P, __m512i __A)
{
  struct __storeu_epi64 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_epi64*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
                                     (__mmask8) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_si512 (void *__P, __m512i __A)
{
  struct __storeu_si512 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_si512*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi32 (void *__P, __m512i __A)
{
  struct __storeu_epi32 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_epi32*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
                                     (__mmask16) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_pd(void *__P, __m512d __A)
{
  struct __storeu_pd {
    __m512d_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pd*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
                                   (__mmask16) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_ps(void *__P, __m512 __A)
{
  struct __storeu_ps {
    __m512_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__P)->__v = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_pd(void *__P, __m512d __A)
{
  *(__m512d*)__P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
                                   (__mmask16) __U);
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_ps(void *__P, __m512 __A)
{
  *(__m512*)__P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_si512 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_epi32 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_epi64 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

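/// Example (illustrative sketch): a masked store writes only the lanes whose
/// mask bit is set and leaves the rest of memory untouched, which makes it a
/// convenient way to handle a loop's remainder. \c dst and \c v below are
/// hypothetical.
/// \code
///   double dst[8];
///   __m512d v = _mm512_set1_pd(1.0);
///   /* Write only dst[0..4]; dst[5..7] are not accessed. */
///   _mm512_mask_storeu_pd(dst, (__mmask8)0x1F, v);
/// \endcode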
/* Mask ops */

static __inline __mmask16 __DEFAULT_FN_ATTRS
_mm512_knot(__mmask16 __M)
{
  return __builtin_ia32_knothi(__M);
}

/* Integer compare */

#define _mm512_cmpeq_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)

#define _mm512_cmpeq_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)

#define _mm512_cmpeq_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)

#define _mm512_cmpeq_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)

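/// Example (illustrative sketch): the comparison macros expand to
/// _mm512_cmp_epi32_mask and friends with the matching _MM_CMPINT_* predicate
/// and return one mask bit per lane. \c a and \c b below are hypothetical.
/// \code
///   __m512i a = _mm512_set1_epi32(3);
///   __m512i b = _mm512_set1_epi32(4);
///   __mmask16 lt = _mm512_cmplt_epi32_mask(a, b);   /* 0xFFFF: all lanes */
///   /* Narrow a prior mask: only lanes already set in it can survive. */
///   __mmask16 k2 = _mm512_mask_cmplt_epi32_mask((__mmask16)0x00FF, a, b);
/// \endcode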
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi8_epi32(__m128i __A)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepi8_epi32(__A),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepi8_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi8_epi64(__m128i __A)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi8_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi8_epi64(__A),
                                             (__v8di)_mm512_setzero_si512 ());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi64(__m256i __X)
{
  return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi32_epi64(__X),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi32_epi64(__X),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_epi32(__m256i __A)
{
  return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_cvtepi16_epi32(__A),
                                            (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_cvtepi16_epi32(__A),
                                            (__v16si)_mm512_setzero_si512 ());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi16_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi16_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu8_epi32(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepu8_epi32(__A),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepu8_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu8_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu8_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu8_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_epi64(__m256i __X)
{
  return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu32_epi64(__X),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu32_epi64(__X),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_epi32(__m256i __A)
{
  return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_cvtepu16_epi32(__A),
                                            (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_cvtepu16_epi32(__A),
                                            (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu16_epi64(__A),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu16_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

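/// Example (illustrative sketch): the epi variants above sign-extend and the
/// epu variants zero-extend, which only differs for lanes whose top bit is
/// set.
/// \code
///   __m128i bytes = _mm_set1_epi8((char)0x80);  /* -128 as a signed char */
///   __m512i s = _mm512_cvtepi8_epi32(bytes);    /* each lane: -128 */
///   __m512i u = _mm512_cvtepu8_epi32(bytes);    /* each lane:  128 */
/// \endcode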
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rorv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                           (__v16si)_mm512_rorv_epi32(__A, __B),
                                           (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                           (__v16si)_mm512_rorv_epi32(__A, __B),
                                           (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rorv_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                            (__v8di)_mm512_rorv_epi64(__A, __B),
                                            (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                            (__v8di)_mm512_rorv_epi64(__A, __B),
                                            (__v8di)_mm512_setzero_si512());
}


#define _mm512_cmp_epi32_mask(a, b, p) \
  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
                                          (__v16si)(__m512i)(b), (int)(p), \
                                          (__mmask16)-1))

#define _mm512_cmp_epu32_mask(a, b, p) \
  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
                                           (__v16si)(__m512i)(b), (int)(p), \
                                           (__mmask16)-1))

#define _mm512_cmp_epi64_mask(a, b, p) \
  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
                                         (__v8di)(__m512i)(b), (int)(p), \
                                         (__mmask8)-1))

#define _mm512_cmp_epu64_mask(a, b, p) \
  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
                                          (__v8di)(__m512i)(b), (int)(p), \
                                          (__mmask8)-1))

#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
                                          (__v16si)(__m512i)(b), (int)(p), \
                                          (__mmask16)(m)))

#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
                                           (__v16si)(__m512i)(b), (int)(p), \
                                           (__mmask16)(m)))

#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
                                         (__v8di)(__m512i)(b), (int)(p), \
                                         (__mmask8)(m)))

#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
                                          (__v8di)(__m512i)(b), (int)(p), \
                                          (__mmask8)(m)))

#define _mm512_rol_epi32(a, b) \
  ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))

#define _mm512_mask_rol_epi32(W, U, a, b) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_rol_epi32((a), (b)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_rol_epi32(U, a, b) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_rol_epi32((a), (b)), \
                                       (__v16si)_mm512_setzero_si512()))

#define _mm512_rol_epi64(a, b) \
  ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)))

#define _mm512_mask_rol_epi64(W, U, a, b) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_rol_epi64((a), (b)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_rol_epi64(U, a, b) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_rol_epi64((a), (b)), \
                                       (__v8di)_mm512_setzero_si512()))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rolv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                           (__v16si)_mm512_rolv_epi32(__A, __B),
                                           (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                           (__v16si)_mm512_rolv_epi32(__A, __B),
                                           (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rolv_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                            (__v8di)_mm512_rolv_epi64(__A, __B),
                                            (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                            (__v8di)_mm512_rolv_epi64(__A, __B),
                                            (__v8di)_mm512_setzero_si512());
}

#define _mm512_ror_epi32(A, B) \
  ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)))

#define _mm512_mask_ror_epi32(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_ror_epi32((A), (B)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_ror_epi32(U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_ror_epi32((A), (B)), \
                                       (__v16si)_mm512_setzero_si512()))

#define _mm512_ror_epi64(A, B) \
  ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)))

#define _mm512_mask_ror_epi64(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_ror_epi64((A), (B)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_ror_epi64(U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_ror_epi64((A), (B)), \
                                       (__v8di)_mm512_setzero_si512()))

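/// Example (illustrative sketch): rol/ror rotate each 32- or 64-bit lane by an
/// immediate, and rolv/rorv rotate each lane by its own count; unlike a plain
/// shift, the bits rotated out re-enter on the other side.
/// \code
///   __m512i x  = _mm512_set1_epi32((int)0x80000001);
///   __m512i r1 = _mm512_rol_epi32(x, 1);   /* each lane: 0x00000003 */
///   __m512i r2 = _mm512_rorv_epi32(x, _mm512_set1_epi32(1));
///                                          /* each lane: 0xC0000000 */
/// \endcode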
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_slli_epi32(__A, __B),
                                         (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_slli_epi32(__A, __B),
                                         (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_slli_epi64(__A, __B),
                                          (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_slli_epi64(__A, __B),
                                          (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srli_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srli_epi32(__A, __B),
                                         (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srli_epi32(__A, __B),
                                         (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srli_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_srli_epi64(__A, __B),
                                          (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
                        unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_srli_epi64(__A, __B),
                                          (__v8di)_mm512_setzero_si512());
}

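/// Example (illustrative sketch): slli/srli shift every lane by the same
/// immediate count; for these logical shifts a count at or above the lane
/// width produces zero rather than being taken modulo the width.
/// \code
///   __m512i x = _mm512_set1_epi32(1);
///   __m512i a = _mm512_slli_epi32(x, 4);    /* each lane: 16 */
///   __m512i b = _mm512_slli_epi32(x, 32);   /* each lane: 0  */
/// \endcode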
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
                                                        (__v16si) __W,
                                                        (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
                                                        (__v16si)
                                                        _mm512_setzero_si512 (),
                                                        (__mmask16) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
                                        (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
                                               (__v16si) __A,
                                               (__v16si) __W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
                                               (__v16si) __A,
                                               (__v16si) _mm512_setzero_si512 ());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
                                               (__v8di) __A,
                                               (__v8di) __W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
                                               (__v8di) __A,
                                               (__v8di) _mm512_setzero_si512 ());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
                                                        (__v8di) __W,
                                                        (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
                                                        (__v8di)
                                                        _mm512_setzero_si512 (),
                                                        (__mmask8) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
                                        (__mmask8) __U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_movedup_pd (__m512d __A)
{
  return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
                                          0, 0, 2, 2, 4, 4, 6, 6);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_movedup_pd(__A),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_movedup_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

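/// Example (illustrative sketch): mask_mov/maskz_mov blend two registers under
/// a mask without touching memory, and _mm512_movedup_pd duplicates each
/// even-indexed double into the following odd lane.
/// \code
///   /* Alternate 0 and 7 across the 16 int lanes. */
///   __m512i sel = _mm512_mask_mov_epi32(_mm512_setzero_si512(),
///                                       (__mmask16)0xAAAA,
///                                       _mm512_set1_epi32(7));
/// \endcode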
#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)-1, (int)(R)))

#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_fixupimm_pd(A, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)-1, \
                                              _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)(U), \
                                              _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8di)(__m512i)(C), \
                                               (int)(imm), (__mmask8)(U), \
                                               (int)(R)))

#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8di)(__m512i)(C), \
                                               (int)(imm), (__mmask8)(U), \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)-1, (int)(R)))

#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_fixupimm_ps(A, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)-1, \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)(U), \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16si)(__m512i)(C), \
                                              (int)(imm), (__mmask16)(U), \
                                              (int)(R)))

#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16si)(__m512i)(C), \
                                              (int)(imm), (__mmask16)(U), \
                                              _MM_FROUND_CUR_DIRECTION))

#define _mm_fixupimm_round_sd(A, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), (int)(R)))

#define _mm_fixupimm_sd(A, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)-1, \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2di)(__m128i)(C), (int)(imm), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2di)(__m128i)(C), (int)(imm), \
                                            (__mmask8)(U), \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm_fixupimm_round_ss(A, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)-1, (int)(R)))

#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)(U), (int)(R)))

#define _mm_fixupimm_ss(A, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)-1, \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)(U), \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4si)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), (int)(R)))

#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4si)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), \
                                           _MM_FROUND_CUR_DIRECTION))

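/// Note (informal): the fixupimm family classifies each element of the second
/// source as one of several special-value tokens (QNaN, +/-0, +/-1, ...) and
/// substitutes a response chosen by the corresponding 4-bit field of the
/// integer table operand, which is useful for patching special cases after an
/// approximation; see the VFIXUPIMMPD/PS/SD/SS instruction descriptions for
/// the exact token and response encodings.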
#define _mm_getexp_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                  (__v2df)(__m128d)(B), \
                                                  (__v2df)_mm_setzero_pd(), \
                                                  (__mmask8)-1, (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_getexp_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
                 (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
                                                          (__v2df) __B,
                                                          (__v2df) __W,
                                                          (__mmask8) __U,
                                                          _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                  (__v2df)(__m128d)(B), \
                                                  (__v2df)(__m128d)(W), \
                                                  (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
                                                          (__v2df) __B,
                                                          (__v2df) _mm_setzero_pd (),
                                                          (__mmask8) __U,
                                                          _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                  (__v2df)(__m128d)(B), \
                                                  (__v2df)_mm_setzero_pd(), \
                                                  (__mmask8)(U), (int)(R)))

#define _mm_getexp_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                 (__v4sf)(__m128)(B), \
                                                 (__v4sf)_mm_setzero_ps(), \
                                                 (__mmask8)-1, (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_getexp_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
                (__v4sf) __B, (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
                                                         (__v4sf) __B,
                                                         (__v4sf) __W,
                                                         (__mmask8) __U,
                                                         _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                 (__v4sf)(__m128)(B), \
                                                 (__v4sf)(__m128)(W), \
                                                 (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
                                                         (__v4sf) __B,
                                                         (__v4sf) _mm_setzero_ps (),
                                                         (__mmask8) __U,
                                                         _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                 (__v4sf)(__m128)(B), \
                                                 (__v4sf)_mm_setzero_ps(), \
                                                 (__mmask8)(U), (int)(R)))

#define _mm_getmant_round_sd(A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)-1, (int)(R)))

#define _mm_getmant_sd(A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)-1, \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)(__m128d)(W), \
                                                (__mmask8)(U), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)(__m128d)(W), \
                                                (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sd(U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)(U), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)(U), (int)(R)))

#define _mm_getmant_round_ss(A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)-1, (int)(R)))

#define _mm_getmant_ss(A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)-1, \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)(__m128)(W), \
                                               (__mmask8)(U), \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)(__m128)(W), \
                                               (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_ss(U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)(U), \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)(U), (int)(R)))

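/// Example (illustrative sketch): getexp and getmant decompose a value as
/// x = getmant(x) * 2^getexp(x); with the [1, 2) normalization interval and
/// _MM_MANT_SIGN_src, the mantissa lands in [1.0, 2.0) with the source's sign.
/// \code
///   __m128d x = _mm_set_sd(12.0);
///   __m128d e = _mm_getexp_sd(x, x);                 /* low lane: 3.0 */
///   __m128d m = _mm_getmant_sd(x, x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
///                                                    /* low lane: 1.5 */
/// \endcode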
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kmov (__mmask16 __A)
{
  return __A;
}

#define _mm_comi_round_sd(A, B, P, R) \
  ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
                               (int)(P), (int)(R)))

#define _mm_comi_round_ss(A, B, P, R) \
  ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
                               (int)(P), (int)(R)))

#ifdef __x86_64__
#define _mm_cvt_roundsd_si64(A, R) \
  ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
#endif

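/// Example (illustrative sketch): _mm_comi_round_ss compares the low lanes
/// under a _CMP_* predicate and returns 0 or 1; the predicate and exception
/// macros come from the surrounding AVX headers.
/// \code
///   int le = _mm_comi_round_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f),
///                              _CMP_LE_OQ, _MM_FROUND_NO_EXC);   /* 1 */
/// \endcode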
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sll_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sll_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sll_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sll_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sll_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sll_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_sllv_epi32(__X, __Y),
                                            (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_sllv_epi32(__X, __Y),
                                            (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi64(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sllv_epi64(__X, __Y),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sra_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sra_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sra_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sra_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sra_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sra_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srav_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_srav_epi32(__X, __Y),
                                            (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_srav_epi32(__X, __Y),
                                            (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srav_epi64(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srav_epi64(__X, __Y),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srav_epi64(__X, __Y),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srl_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srl_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srl_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srl_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srl_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srl_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_srlv_epi32(__X, __Y),
                                            (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_srlv_epi32(__X, __Y),
                                            (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srlv_epi64(__X, __Y),
                                             (__v8di)_mm512_setzero_si512());
}

/// \enum _MM_TERNLOG_ENUM
/// A helper to represent the ternary logic operations among vectors \a A,
/// \a B and \a C. The representation is passed to \a imm.
typedef enum {
  _MM_TERNLOG_A = 0xF0,
  _MM_TERNLOG_B = 0xCC,
  _MM_TERNLOG_C = 0xAA
} _MM_TERNLOG_ENUM;

#define _mm512_ternarylogic_epi32(A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_mask( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
      (unsigned char)(imm), (__mmask16)-1))

#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_mask( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
      (unsigned char)(imm), (__mmask16)(U)))

#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogd512_maskz( \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
      (unsigned char)(imm), (__mmask16)(U)))

#define _mm512_ternarylogic_epi64(A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_mask( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
      (unsigned char)(imm), (__mmask8)-1))

#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_mask( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
      (unsigned char)(imm), (__mmask8)(U)))

#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
  ((__m512i)__builtin_ia32_pternlogq512_maskz( \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
      (unsigned char)(imm), (__mmask8)(U)))
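
/* Illustrative usage (a sketch, not part of the original header): each bit
 * of \a imm holds the result of the ternary function for one combination of
 * the corresponding bits of \a A, \a B and \a C, so the _MM_TERNLOG_*
 * constants can be combined with ordinary C bitwise operators to spell out
 * the desired boolean expression. For example, assuming vectors __a, __b
 * and __c, computing (A & B) | C per 32-bit element uses imm 0xEA:
 *
 *   __m512i __r = _mm512_ternarylogic_epi32(
 *       __a, __b, __c, (_MM_TERNLOG_A & _MM_TERNLOG_B) | _MM_TERNLOG_C);
 */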

#ifdef __x86_64__
#define _mm_cvt_roundsd_i64(A, R) \
  ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
#endif

#define _mm_cvt_roundsd_si32(A, R) \
  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvt_roundsd_i32(A, R) \
  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvt_roundsd_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtsd_u32(__m128d __A)
{
  return (unsigned) __builtin_ia32_vcvtsd2usi32((__v2df) __A,
                                                _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvt_roundsd_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
                                                   (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsd_u64(__m128d __A)
{
  return (unsigned long long) __builtin_ia32_vcvtsd2usi64((__v2df) __A,
                                                          _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvt_roundss_si32(A, R) \
  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvt_roundss_i32(A, R) \
  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))

#ifdef __x86_64__
#define _mm_cvt_roundss_si64(A, R) \
  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvt_roundss_i64(A, R) \
  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
#endif

#define _mm_cvt_roundss_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtss_u32(__m128 __A)
{
  return (unsigned) __builtin_ia32_vcvtss2usi32((__v4sf) __A,
                                                _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvt_roundss_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
                                                   (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtss_u64(__m128 __A)
{
  return (unsigned long long) __builtin_ia32_vcvtss2usi64((__v4sf) __A,
                                                          _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundsd_i32(A, R) \
  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvtt_roundsd_si32(A, R) \
  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_cvttsd_i32(__m128d __A)
{
  return (int) __builtin_ia32_vcvttsd2si32((__v2df) __A,
                                           _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsd_si64(A, R) \
  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvtt_roundsd_i64(A, R) \
  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128
_mm_cvttsd_i64(__m128d __A)
{
  return (long long) __builtin_ia32_vcvttsd2si64((__v2df) __A,
                                                 _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundsd_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvttsd_u32(__m128d __A)
{
  return (unsigned) __builtin_ia32_vcvttsd2usi32((__v2df) __A,
                                                 _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundsd_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
                                                    (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsd_u64(__m128d __A)
{
  return (unsigned long long) __builtin_ia32_vcvttsd2usi64((__v2df) __A,
                                                           _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundss_i32(A, R) \
  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvtt_roundss_si32(A, R) \
  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_cvttss_i32(__m128 __A)
{
  return (int) __builtin_ia32_vcvttss2si32((__v4sf) __A,
                                           _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundss_i64(A, R) \
  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvtt_roundss_si64(A, R) \
  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128
_mm_cvttss_i64(__m128 __A)
{
  return (long long) __builtin_ia32_vcvttss2si64((__v4sf) __A,
                                                 _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm_cvtt_roundss_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvttss_u32(__m128 __A)
{
  return (unsigned) __builtin_ia32_vcvttss2usi32((__v4sf) __A,
                                                 _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
#define _mm_cvtt_roundss_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
                                                    (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttss_u64(__m128 __A)
{
  return (unsigned long long) __builtin_ia32_vcvttss2usi64((__v4sf) __A,
                                                           _MM_FROUND_CUR_DIRECTION);
}
#endif

#define _mm512_permute_pd(X, C) \
  ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)))

#define _mm512_mask_permute_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permute_pd((X), (C)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_permute_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permute_pd((X), (C)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_permute_ps(X, C) \
  ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)))

#define _mm512_mask_permute_ps(W, U, X, C) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_permute_ps((X), (C)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_permute_ps(U, X, C) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_permute_ps((X), (C)), \
                                       (__v16sf)_mm512_setzero_ps()))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_permutevar_pd(__m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_permutevar_pd(__A, __C),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_permutevar_pd(__A, __C),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_permutevar_ps(__m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_permutevar_ps(__A, __C),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_permutevar_ps(__A, __C),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
{
  return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
                                                 (__v8df)__B);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                              (__v8df)__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
                             __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                              (__v8df)(__m512d)__I);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
                             __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
                                                (__v16sf)__B);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                             (__v16sf)__A);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                             (__v16sf)(__m512)__I);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                             (__v16sf)_mm512_setzero_ps());
}


#define _mm512_cvtt_roundpd_epu32(A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)_mm256_undefined_si256(), \
                                              (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)(__m256i)(W), \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)_mm256_setzero_si256(), \
                                              (__mmask8)(U), (int)(R)))

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epu32(__m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask((__v8df) __A,
                                                     (__v8si) _mm256_undefined_si256(),
                                                     (__mmask8) -1,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask((__v8df) __A,
                                                     (__v8si) __W,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask((__v8df) __A,
                                                     (__v8si) _mm256_setzero_si256(),
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_roundscale_round_sd(A, B, imm, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)-1, (int)(imm), \
                                                 (int)(R)))

#define _mm_roundscale_sd(A, B, imm) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)-1, (int)(imm), \
                                                 _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sd(W, U, A, B, imm) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U), (int)(imm), \
                                                 _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U), (int)(I), \
                                                 (int)(R)))

#define _mm_maskz_roundscale_sd(U, A, B, I) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(I), \
                                                 _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(I), \
                                                 (int)(R)))

#define _mm_roundscale_round_ss(A, B, imm, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)-1, (int)(imm), \
                                                (int)(R)))

#define _mm_roundscale_ss(A, B, imm) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)-1, (int)(imm), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_ss(W, U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W), \
                                                (__mmask8)(U), (int)(I), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W), \
                                                (__mmask8)(U), (int)(I), \
                                                (int)(R)))

#define _mm_maskz_roundscale_ss(U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(I), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(I), \
                                                (int)(R)))

#define _mm512_scalef_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_scalef_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask((__v8df) __A,
                                                   (__v8df) __B,
                                                   (__v8df) _mm512_undefined_pd(),
                                                   (__mmask8) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask((__v8df) __A,
                                                   (__v8df) __B,
                                                   (__v8df) __W,
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_pd(__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask((__v8df) __A,
                                                   (__v8df) __B,
                                                   (__v8df) _mm512_setzero_pd(),
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)_mm512_undefined_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_scalef_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_scalefps512_mask((__v16sf) __A,
                                                  (__v16sf) __B,
                                                  (__v16sf) _mm512_undefined_ps(),
                                                  (__mmask16) -1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_scalefps512_mask((__v16sf) __A,
                                                  (__v16sf) __B,
                                                  (__v16sf) __W,
                                                  (__mmask16) __U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_scalefps512_mask((__v16sf) __A,
                                                  (__v16sf) __B,
                                                  (__v16sf) _mm512_setzero_ps(),
                                                  (__mmask16) __U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

#define _mm_scalef_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)-1, (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_scalef_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_scalefsd_round_mask((__v2df) __A,
                                                      (__v2df) __B,
                                                      (__v2df) _mm_setzero_pd(),
                                                      (__mmask8) -1,
                                                      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_scalefsd_round_mask((__v2df) __A,
                                                      (__v2df) __B,
                                                      (__v2df) __W,
                                                      (__mmask8) __U,
                                                      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)(__m128d)(W), \
                                               (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sd(__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_scalefsd_round_mask((__v2df) __A,
                                                      (__v2df) __B,
                                                      (__v2df) _mm_setzero_pd(),
                                                      (__mmask8) __U,
                                                      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)(U), (int)(R)))

#define _mm_scalef_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)-1, (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_scalef_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_scalefss_round_mask((__v4sf) __A,
                                                     (__v4sf) __B,
                                                     (__v4sf) _mm_setzero_ps(),
                                                     (__mmask8) -1,
                                                     _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_scalef_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_scalefss_round_mask((__v4sf) __A,
                                                     (__v4sf) __B,
                                                     (__v4sf) __W,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)(__m128)(W), \
                                              (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_ss(__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_scalefss_round_mask((__v4sf) __A,
                                                     (__v4sf) __B,
                                                     (__v4sf) _mm_setzero_ps(),
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(U), \
                                              (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srai_epi32(__A, __B),
                                             (__v16si)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_srai_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srai_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srai_epi64(__A, __B),
                                             (__v8di)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_srai_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}

#define _mm512_shuffle_f32x4(A, B, imm) \
  ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
                                     (__v16sf)(__m512)(B), (int)(imm)))

#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
                                       (__v16sf)_mm512_setzero_ps()))

#define _mm512_shuffle_f64x2(A, B, imm) \
  ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
                                      (__v8df)(__m512d)(B), (int)(imm)))

#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_shuffle_i32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
                                      (__v16si)(__m512i)(B), (int)(imm)))

#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
                                       (__v16si)_mm512_setzero_si512()))

#define _mm512_shuffle_i64x2(A, B, imm) \
  ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
                                      (__v8di)(__m512i)(B), (int)(imm)))

#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
                                       (__v8di)_mm512_setzero_si512()))

#define _mm512_shuffle_pd(A, B, M) \
  ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \
                                     (__v8df)(__m512d)(B), (int)(M)))

#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_shuffle_pd(U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_shuffle_ps(A, B, M) \
  ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \
                                    (__v16sf)(__m512)(B), (int)(M)))

#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_shuffle_ps(U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
                                       (__v16sf)_mm512_setzero_ps()))

#define _mm_sqrt_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)_mm_setzero_pd(), \
                                             (__mmask8)-1, (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_sqrtsd_round_mask((__v2df) __A,
                                                    (__v2df) __B,
                                                    (__v2df) __W,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)(__m128d)(W), \
                                             (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_sd(__mmask8 __U, __m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_sqrtsd_round_mask((__v2df) __A,
                                                    (__v2df) __B,
                                                    (__v2df) _mm_setzero_pd(),
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)_mm_setzero_pd(), \
                                             (__mmask8)(U), (int)(R)))

#define _mm_sqrt_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)_mm_setzero_ps(), \
                                            (__mmask8)-1, (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_sqrtss_round_mask((__v4sf) __A,
                                                   (__v4sf) __B,
                                                   (__v4sf) __W,
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)(__m128)(W), (__mmask8)(U), \
                                            (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_ss(__mmask8 __U, __m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_sqrtss_round_mask((__v4sf) __A,
                                                   (__v4sf) __B,
                                                   (__v4sf) _mm_setzero_ps(),
                                                   (__mmask8) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)_mm_setzero_ps(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_broadcast_f32x4(__m128 __A)
{
  return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
                                         0, 1, 2, 3, 0, 1, 2, 3,
                                         0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
                                             (__v16sf)_mm512_broadcast_f32x4(__A),
                                             (__v16sf)__O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
                                             (__v16sf)_mm512_broadcast_f32x4(__A),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_broadcast_f64x4(__m256d __A)
{
  return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
                                          0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
                                              (__v8df)_mm512_broadcast_f64x4(__A),
                                              (__v8df)__O);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
                                              (__v8df)_mm512_broadcast_f64x4(__A),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcast_i32x4(__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
                                          0, 1, 2, 3, 0, 1, 2, 3,
                                          0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_broadcast_i32x4(__A),
                                             (__v16si)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_broadcast_i32x4(__A),
                                             (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcast_i64x4(__m256i __A)
{
  return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
                                          0, 1, 2, 3, 0, 1, 2, 3);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_broadcast_i64x4(__A),
                                             (__v8di)__O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_broadcast_i64x4(__A),
                                             (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__M,
                                              (__v8df) _mm512_broadcastsd_pd(__A),
                                              (__v8df) __O);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__M,
                                              (__v8df) _mm512_broadcastsd_pd(__A),
                                              (__v8df) _mm512_setzero_pd());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__M,
                                             (__v16sf) _mm512_broadcastss_ps(__A),
                                             (__v16sf) __O);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__M,
                                             (__v16sf) _mm512_broadcastss_ps(__A),
                                             (__v16sf) _mm512_setzero_ps());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi32_epi8(__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask((__v16si) __A,
                                                  (__v16qi) _mm_undefined_si128(),
                                                  (__mmask16) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask((__v16si) __A,
                                                  (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsdb512_mask((__v16si) __A,
                                                  (__v16qi) _mm_setzero_si128(),
                                                  __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovsdb512mem_mask((__v16qi *) __P, (__v16si) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi32_epi16(__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask((__v16si) __A,
                                                  (__v16hi) _mm256_undefined_si256(),
                                                  (__mmask16) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask((__v16si) __A,
                                                  (__v16hi) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi32_epi16(__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsdw512_mask((__v16si) __A,
                                                  (__v16hi) _mm256_setzero_si256(),
                                                  __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovsdw512mem_mask((__v16hi *) __P, (__v16si) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi64_epi8(__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask((__v8di) __A,
                                                  (__v16qi) _mm_undefined_si128(),
                                                  (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask((__v8di) __A,
                                                  (__v16qi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi64_epi8(__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqb512_mask((__v8di) __A,
                                                  (__v16qi) _mm_setzero_si128(),
                                                  __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqb512mem_mask((__v16qi *) __P, (__v8di) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi64_epi32(__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask((__v8di) __A,
                                                  (__v8si) _mm256_undefined_si256(),
                                                  (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask((__v8di) __A,
                                                  (__v8si) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi64_epi32(__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovsqd512_mask((__v8di) __A,
                                                  (__v8si) _mm256_setzero_si256(),
                                                  __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqd512mem_mask((__v8si *) __P, (__v8di) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtsepi64_epi16(__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask((__v8di) __A,
                                                  (__v8hi) _mm_undefined_si128(),
                                                  (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask((__v8di) __A,
                                                  (__v8hi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtsepi64_epi16(__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovsqw512_mask((__v8di) __A,
                                                  (__v8hi) _mm_setzero_si128(),
                                                  __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtsepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovsqw512mem_mask((__v8hi *) __P, (__v8di) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi32_epi8(__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask((__v16si) __A,
                                                   (__v16qi) _mm_undefined_si128(),
                                                   (__mmask16) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask((__v16si) __A,
                                                   (__v16qi) __O,
                                                   __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi32_epi8(__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusdb512_mask((__v16si) __A,
                                                   (__v16qi) _mm_setzero_si128(),
                                                   __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi32_storeu_epi8(void *__P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovusdb512mem_mask((__v16qi *) __P, (__v16si) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi32_epi16(__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusdw512_mask((__v16si) __A,
                                                   (__v16hi) _mm256_undefined_si256(),
                                                   (__mmask16) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusdw512_mask((__v16si) __A,
                                                   (__v16hi) __O,
                                                   __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi32_epi16(__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusdw512_mask((__v16si) __A,
                                                   (__v16hi) _mm256_setzero_si256(),
                                                   __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovusdw512mem_mask((__v16hi *) __P, (__v16si) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi64_epi8(__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqb512_mask((__v8di) __A,
                                                   (__v16qi) _mm_undefined_si128(),
                                                   (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqb512_mask((__v8di) __A,
                                                   (__v16qi) __O,
                                                   __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi64_epi8(__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqb512_mask((__v8di) __A,
                                                   (__v16qi) _mm_setzero_si128(),
                                                   __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_storeu_epi8(void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqb512mem_mask((__v16qi *) __P, (__v8di) __A, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi64_epi32(__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusqd512_mask((__v8di) __A,
                                                   (__v8si) _mm256_undefined_si256(),
                                                   (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusqd512_mask((__v8di) __A,
                                                   (__v8si) __O, __M);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi64_epi32(__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovusqd512_mask((__v8di) __A,
                                                   (__v8si) _mm256_setzero_si256(),
                                                   __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqd512mem_mask((__v8si *) __P, (__v8di) __A, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtusepi64_epi16(__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqw512_mask((__v8di) __A,
                                                   (__v8hi) _mm_undefined_si128(),
                                                   (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqw512_mask((__v8di) __A,
                                                   (__v8hi) __O, __M);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtusepi64_epi16(__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovusqw512_mask((__v8di) __A,
                                                   (__v8hi) _mm_setzero_si128(),
                                                   __M);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovusqw512mem_mask((__v8hi *) __P, (__v8di) __A, __M);
}

7230 static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi8(__m512i __A)7231 _mm512_cvtepi32_epi8 (__m512i __A)
7232 {
7233 return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
7234 (__v16qi) _mm_undefined_si128 (),
7235 (__mmask16) -1);
7236 }
7237
7238 static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi8(__m128i __O,__mmask16 __M,__m512i __A)7239 _mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
7240 {
7241 return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
7242 (__v16qi) __O, __M);
7243 }
7244
7245 static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi8(__mmask16 __M,__m512i __A)7246 _mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
7247 {
7248 return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
7249 (__v16qi) _mm_setzero_si128 (),
7250 __M);
7251 }
7252
7253 static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_storeu_epi8(void * __P,__mmask16 __M,__m512i __A)7254 _mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
7255 {
7256 __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
7257 }
7258
7259 static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi16(__m512i __A)7260 _mm512_cvtepi32_epi16 (__m512i __A)
7261 {
7262 return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
7263 (__v16hi) _mm256_undefined_si256 (),
7264 (__mmask16) -1);
7265 }
7266
7267 static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi16(__m256i __O,__mmask16 __M,__m512i __A)7268 _mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
7269 {
7270 return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
7271 (__v16hi) __O, __M);
7272 }
7273
7274 static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi16(__mmask16 __M,__m512i __A)7275 _mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
7276 {
7277 return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
7278 (__v16hi) _mm256_setzero_si256 (),
7279 __M);
7280 }
7281
7282 static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_storeu_epi16(void * __P,__mmask16 __M,__m512i __A)7283 _mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
7284 {
7285 __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
7286 }
7287
7288 static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi8(__m512i __A)7289 _mm512_cvtepi64_epi8 (__m512i __A)
7290 {
7291 return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
7292 (__v16qi) _mm_undefined_si128 (),
7293 (__mmask8) -1);
7294 }
7295
7296 static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi8(__m128i __O,__mmask8 __M,__m512i __A)7297 _mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
7298 {
7299 return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
7300 (__v16qi) __O, __M);
7301 }
7302
7303 static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi8(__mmask8 __M,__m512i __A)7304 _mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
7305 {
7306 return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
7307 (__v16qi) _mm_setzero_si128 (),
7308 __M);
7309 }
7310
7311 static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi8(void * __P,__mmask8 __M,__m512i __A)7312 _mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
7313 {
7314 __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
7315 }
7316
7317 static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi32(__m512i __A)7318 _mm512_cvtepi64_epi32 (__m512i __A)
7319 {
7320 return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
7321 (__v8si) _mm256_undefined_si256 (),
7322 (__mmask8) -1);
7323 }
7324
7325 static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi32(__m256i __O,__mmask8 __M,__m512i __A)7326 _mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
7327 {
7328 return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
7329 (__v8si) __O, __M);
7330 }
7331
7332 static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi32(__mmask8 __M,__m512i __A)7333 _mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
7334 {
7335 return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
7336 (__v8si) _mm256_setzero_si256 (),
7337 __M);
7338 }
7339
7340 static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi32(void * __P,__mmask8 __M,__m512i __A)7341 _mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
7342 {
7343 __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
7344 }
7345
7346 static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi16(__m512i __A)7347 _mm512_cvtepi64_epi16 (__m512i __A)
7348 {
7349 return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
7350 (__v8hi) _mm_undefined_si128 (),
7351 (__mmask8) -1);
7352 }
7353
7354 static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi16(__m128i __O,__mmask8 __M,__m512i __A)7355 _mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
7356 {
7357 return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
7358 (__v8hi) __O, __M);
7359 }
7360
7361 static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi16(__mmask8 __M,__m512i __A)7362 _mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
7363 {
7364 return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
7365 (__v8hi) _mm_setzero_si128 (),
7366 __M);
7367 }
7368
7369 static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi16(void * __P,__mmask8 __M,__m512i __A)7370 _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
7371 {
7372 __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
7373 }
7374
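/* Usage sketch (illustrative only, not part of this header): the pmov
 * down-conversions above truncate each element, and the *_storeu_* forms
 * write only the lanes selected by the mask. The demo function below is
 * hypothetical:
 *
 *   void demo_truncate(const long long *src, short *dst) {
 *     __m512i v = _mm512_loadu_si512((const void *)src);   // 8 x i64
 *     // keep the low 16 bits of each quadword; store lanes 0-3 only
 *     _mm512_mask_cvtepi64_storeu_epi16(dst, (__mmask8)0x0F, v);
 *   }
 */
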
#define _mm512_extracti32x4_epi32(A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_undefined_si128(), \
                                             (__mmask8)-1))

#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)(__m128i)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_setzero_si128(), \
                                             (__mmask8)(U)))

#define _mm512_extracti64x4_epi64(A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_undefined_si256(), \
                                             (__mmask8)-1))

#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)(__m256i)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_setzero_si256(), \
                                             (__mmask8)(U)))

#define _mm512_insertf64x4(A, B, imm) \
  ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
                                       (__v4df)(__m256d)(B), (int)(imm)))

#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                 (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                 (__v8df)(__m512d)(W)))

#define _mm512_maskz_insertf64x4(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                 (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                 (__v8df)_mm512_setzero_pd()))

#define _mm512_inserti64x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
                                       (__v4di)(__m256i)(B), (int)(imm)))

#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                 (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                 (__v8di)(__m512i)(W)))

#define _mm512_maskz_inserti64x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                 (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                 (__v8di)_mm512_setzero_si512()))

#define _mm512_insertf32x4(A, B, imm) \
  ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
                                      (__v4sf)(__m128)(B), (int)(imm)))

#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                 (__v16sf)(__m512)(W)))

#define _mm512_maskz_insertf32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                 (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                 (__v16sf)_mm512_setzero_ps()))

#define _mm512_inserti32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
                                       (__v4si)(__m128i)(B), (int)(imm)))

#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                 (__v16si)(__m512i)(W)))

#define _mm512_maskz_inserti32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                 (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                 (__v16si)_mm512_setzero_si512()))

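/* Usage sketch (illustrative only, not part of this header): the extract
 * and insert macros take an immediate lane index; imm is 0..3 for the 32x4
 * forms (one 128-bit chunk) and 0..1 for the 64x4 forms (one 256-bit half):
 *
 *   __m512i v     = _mm512_set1_epi32(7);
 *   __m128i lane2 = _mm512_extracti32x4_epi32(v, 2);   // bits 383:256
 *   __m512i w     = _mm512_inserti32x4(v, lane2, 0);   // replace chunk 0
 */
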
#define _mm512_getmant_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_undefined_pd(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_getmant_pd(A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)-1, \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_pd(W, U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
                                             (__mmask8)(U), \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_pd(U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)(U), \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_getmant_ps(A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ps(W, U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ps(U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm512_getexp_round_pd(A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_getexp_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getexp_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_getexp_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
                (__v8df) _mm512_undefined_pd (),
                (__mmask8) -1,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
                (__v8df) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
                (__v8df) _mm512_setzero_pd (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ps(A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_undefined_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_getexp_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_getexp_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
                (__v16sf) _mm512_undefined_ps (),
                (__mmask16) -1,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
                (__v16sf) __W,
                (__mmask16) __U,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
                (__v16sf) _mm512_setzero_ps (),
                (__mmask16) __U,
                _MM_FROUND_CUR_DIRECTION);
}

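/* Usage sketch (illustrative only, not part of this header): getexp
 * returns floor(log2(|x|)) per element and getmant the correspondingly
 * normalized mantissa, so x ~= getmant(x, ...) * 2^getexp(x):
 *
 *   __m512d x = _mm512_set1_pd(48.0);
 *   __m512d e = _mm512_getexp_pd(x);                        // 5.0
 *   __m512d m = _mm512_getmant_pd(x, _MM_MANT_NORM_1_2,
 *                                 _MM_MANT_SIGN_src);       // 1.5
 */
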
#define _mm512_i64gather_ps(index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_epi32(index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
                                         (void const *)(addr), \
                                         (__v8di)(__m512i)(index), \
                                         (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
                                         (void const *)(addr), \
                                         (__v8di)(__m512i)(index), \
                                         (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i32gather_ps(index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
                                        (void const *)(addr), \
                                        (__v16si)(__m512)(index), \
                                        (__mmask16)-1, (int)(scale)))

#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
                                        (void const *)(addr), \
                                        (__v16si)(__m512)(index), \
                                        (__mmask16)(mask), (int)(scale)))

#define _mm512_i32gather_epi32(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
                                         (void const *)(addr), \
                                         (__v16si)(__m512i)(index), \
                                         (__mmask16)-1, (int)(scale)))

#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
                                         (void const *)(addr), \
                                         (__v16si)(__m512i)(index), \
                                         (__mmask16)(mask), (int)(scale)))

#define _mm512_i32gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i32gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

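/* Usage sketch (illustrative only, not part of this header): the gathers
 * load one element per index; scale must be the immediate 1, 2, 4 or 8,
 * and masked-off lanes of the _mask_ forms keep the value from v1_old:
 *
 *   double table[1024];
 *   __m256i idx = _mm256_setr_epi32(0, 3, 9, 27, 81, 243, 729, 1000);
 *   __m512d g   = _mm512_i32gather_pd(idx, table, 8);   // table[idx[i]]
 */
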
#define _mm512_i64scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_i64scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_i32scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

#define _mm512_i32scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

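/* Usage sketch (illustrative only, not part of this header): the scatters
 * are the store-side duals of the gathers above; when enabled indices
 * collide, the element from the highest-numbered lane is the one that
 * remains in memory:
 *
 *   double table[1024];
 *   __m256i idx = _mm256_setr_epi32(0, 3, 9, 27, 81, 243, 729, 1000);
 *   _mm512_mask_i32scatter_pd(table, (__mmask8)0xA5, idx,
 *                             _mm512_set1_pd(1.0), 8);
 */
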
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       (__v4sf)__A,
                                       (__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        (__v4sf)__B,
                                        (__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
                                        (__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                          (__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       (__v4sf)__A,
                                       -(__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         -(__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        (__v4sf)__B,
                                        -(__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
                                        (__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
                                          (__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       -(__v4sf)__A,
                                       (__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         -(__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        -(__v4sf)__B,
                                        (__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          -(__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
                                        -(__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                          -(__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       -(__v4sf)__A,
                                       -(__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), \
                                         -(__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         -(__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        -(__v4sf)__B,
                                        -(__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          -(__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
                                        -(__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
                                          -(__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       (__v2df)__A,
                                       (__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          (__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        (__v2df)__B,
                                        (__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
                                        (__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
                                           (__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       (__v2df)__A,
                                       -(__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          -(__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          (__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        (__v2df)__B,
                                        -(__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           -(__v2df)(__m128d)(C), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
                                        (__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
                                           (__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), \
                                           (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       -(__v2df)__A,
                                       (__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), \
                                          (__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          -(__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        -(__v2df)__B,
                                        (__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           -(__v2df)(__m128d)(B), \
                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
                                        -(__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
                                           -(__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       -(__v2df)__A,
                                       -(__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm_fnmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), \
                                          -(__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          -(__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        -(__v2df)__B,
                                        -(__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           -(__v2df)(__m128d)(B), \
                                           -(__v2df)(__m128d)(C), \
                                           (__mmask8)(U), \
                                           (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
                                        -(__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
                                           -(__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), \
                                           (__mmask8)(U), (int)(R)))

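/* Usage sketch (illustrative only, not part of this header): the scalar
 * _ss/_sd FMA forms above operate on element 0 only. With mask bit 0
 * clear, the _mask_ form keeps element 0 of __W, the _maskz_ form zeroes
 * it, and the _mask3_ form keeps element 0 of the addend __Y:
 *
 *   __m128 w = _mm_set_ss(8.0f), a = _mm_set_ss(2.0f), b = _mm_set_ss(3.0f);
 *   __m128 r = _mm_mask_fmadd_ss(w, (__mmask8)1, a, b);
 *   // element 0 of r is 8.0f*2.0f + 3.0f == 19.0f
 */
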
#define _mm512_permutex_pd(X, C) \
  ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)))

#define _mm512_mask_permutex_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permutex_pd((X), (C)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_permutex_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permutex_pd((X), (C)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_permutex_epi64(X, C) \
  ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)))

#define _mm512_mask_permutex_epi64(W, U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_permutex_epi64(U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
                                       (__v8di)_mm512_setzero_si512()))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
{
  return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                        (__v8df)_mm512_permutexvar_pd(__X, __Y),
                                        (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                        (__v8df)_mm512_permutexvar_pd(__X, __Y),
                                        (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                      (__v8di)_mm512_permutexvar_epi64(__X, __Y),
                                      (__v8di)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
                               __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                      (__v8di)_mm512_permutexvar_epi64(__X, __Y),
                                      (__v8di)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
{
  return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                       (__v16sf)_mm512_permutexvar_ps(__X, __Y),
                                       (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                       (__v16sf)_mm512_permutexvar_ps(__X, __Y),
                                       (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
}

#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                     (__v16si)_mm512_permutexvar_epi32(__X, __Y),
                                     (__v16si)_mm512_setzero_si512());
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
                               __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                     (__v16si)_mm512_permutexvar_epi32(__X, __Y),
                                     (__v16si)__W);
}

#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32

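/* Usage sketch (illustrative only, not part of this header): permutexvar
 * picks elements of __Y using the per-lane indices in __X, i.e.
 * dst[i] = Y[X[i] & 15] for 32-bit elements. Reversing a vector of 16 ints:
 *
 *   __m512i v   = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
 *                                   8, 9, 10, 11, 12, 13, 14, 15);
 *   __m512i idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8,
 *                                   7, 6, 5, 4, 3, 2, 1, 0);
 *   __m512i rev = _mm512_permutexvar_epi32(idx, v);   // 15, 14, ..., 0
 */
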
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kand (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kandn (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm512_kortestc (__mmask16 __A, __mmask16 __B)
{
  return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm512_kortestz (__mmask16 __A, __mmask16 __B)
{
  return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS
_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
{
  return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS
_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
{
  return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
}

static __inline__ unsigned char __DEFAULT_FN_ATTRS
_kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
  *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
  return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kxnor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kxor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
}

#define _kand_mask16 _mm512_kand
#define _kandn_mask16 _mm512_kandn
#define _knot_mask16 _mm512_knot
#define _kor_mask16 _mm512_kor
#define _kxnor_mask16 _mm512_kxnor
#define _kxor_mask16 _mm512_kxor

#define _kshiftli_mask16(A, I) \
  ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)))

#define _kshiftri_mask16(A, I) \
  ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)))

static __inline__ unsigned int __DEFAULT_FN_ATTRS
_cvtmask16_u32(__mmask16 __A) {
  return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_cvtu32_mask16(unsigned int __A) {
  return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_load_mask16(__mmask16 *__A) {
  return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
}

static __inline__ void __DEFAULT_FN_ATTRS
_store_mask16(__mmask16 *__A, __mmask16 __B) {
  *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
}

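/* Usage sketch (illustrative only, not part of this header; the helper
 * name is ours): the k-register operations combine and test the 16-bit
 * masks produced by 512-bit comparisons, e.g. to skip work when no lane
 * matched either predicate:
 *
 *   static inline int any_match(__m512i x, __m512i y, __m512i z) {
 *     __mmask16 m = _mm512_cmpeq_epi32_mask(x, y);
 *     __mmask16 n = _mm512_cmpgt_epi32_mask(x, z);
 *     return !_mm512_kortestz(m, n);   // 1 iff (m | n) != 0
 *   }
 */
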
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_stream_si512 (void * __P, __m512i __A)
{
  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
  __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_stream_load_si512 (void const *__P)
{
  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
  return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_stream_pd (void *__P, __m512d __A)
{
  typedef __v8df __v8df_aligned __attribute__((aligned(64)));
  __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_stream_ps (void *__P, __m512 __A)
{
  typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
  __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
}

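/* Usage sketch (illustrative only, not part of this header; the helper
 * name is ours): unlike the unaligned loads/stores, the stream forms use
 * non-temporal hints and require 64-byte alignment, and a bulk run of
 * streaming stores is conventionally followed by _mm_sfence() before the
 * data is read back through ordinary loads:
 *
 *   static inline void fill(double *dst, __m512d v, size_t n) {
 *     // dst assumed 64-byte aligned
 *     for (size_t i = 0; i + 8 <= n; i += 8)
 *       _mm512_stream_pd(dst + i, v);
 *     _mm_sfence();
 *   }
 */
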
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
                                                      (__v8df) __W,
                                                      (__mmask8) __U);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
                                                      (__v8df) _mm512_setzero_pd (),
                                                      (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
                                                      (__v8di) __W,
                                                      (__mmask8) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
                                                      (__v8di) _mm512_setzero_si512 (),
                                                      (__mmask8) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
                                                     (__v16sf) __W,
                                                     (__mmask16) __U);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
                                                     (__v16sf) _mm512_setzero_ps (),
                                                     (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
                                                      (__v16si) __W,
                                                      (__mmask16) __U);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
                                                      (__v16si) _mm512_setzero_si512 (),
                                                      (__mmask16) __U);
}

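/* Usage sketch (illustrative only, not part of this header): compress
 * packs the selected elements into the low lanes in order, filling the
 * rest from __W (or with zeros for the maskz forms). A common pattern is
 * left-packing values that pass a predicate:
 *
 *   __m512i v      = _mm512_setr_epi32(3, -1, 4, -1, 5, -9, 2, -6,
 *                                      5, -3, 5, -8, 9, -7, 9, -3);
 *   __mmask16 keep = _mm512_cmpgt_epi32_mask(v, _mm512_setzero_si512());
 *   __m512i packed = _mm512_maskz_compress_epi32(keep, v);
 *   // positive elements of v, contiguous from lane 0; high lanes zero
 */
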
#define _mm_cmp_round_ss_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_ss_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)-1, \
                                       _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
                                       (__v4sf)(__m128)(Y), (int)(P), \
                                       (__mmask8)(M), \
                                       _MM_FROUND_CUR_DIRECTION))

#define _mm_cmp_round_sd_mask(X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sd_mask(X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)-1, \
                                       _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
                                       (__v2df)(__m128d)(Y), (int)(P), \
                                       (__mmask8)(M), \
                                       _MM_FROUND_CUR_DIRECTION))

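/* Usage sketch (illustrative only, not part of this header; the helper
 * name is ours): the scalar compare macros take one of the _CMP_*
 * predicates and return the result in bit 0 of the mask:
 *
 *   static inline int scalar_lt(__m128 a, __m128 b) {
 *     return _mm_cmp_ss_mask(a, b, _CMP_LT_OS) & 1;   // a[0] < b[0]
 *   }
 */
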
/* Bit Test */

static __inline __mmask16 __DEFAULT_FN_ATTRS512
_mm512_test_epi32_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
                                   _mm512_setzero_si512());
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
                                        _mm512_setzero_si512());
}

static __inline __mmask8 __DEFAULT_FN_ATTRS512
_mm512_test_epi64_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
                                   _mm512_setzero_si512());
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
                                        _mm512_setzero_si512());
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
                                  _mm512_setzero_si512());
}

static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
                                       _mm512_setzero_si512());
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
                                  _mm512_setzero_si512());
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
                                       _mm512_setzero_si512());
}

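/* Usage sketch (illustrative only, not part of this header; the helper
 * name is ours): test sets a mask bit where (__A & __B) is nonzero and
 * testn where it is zero, e.g. probing a flag bit across 16 ints at once:
 *
 *   static inline __mmask16 flagged(__m512i words) {
 *     return _mm512_test_epi32_mask(words, _mm512_set1_epi32(0x4));
 *   }
 */
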
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_movehdup_ps (__m512 __A)
{
  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
                         1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_movehdup_ps(__A),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_movehdup_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_moveldup_ps (__m512 __A)
{
  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
                         0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_moveldup_ps(__A),
                                             (__v16sf)__W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_moveldup_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
                                     _mm_setzero_ps());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
                                     _mm_setzero_pd());
}

static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
{
  __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
}

static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
{
  __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
{
  __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
                                                (__v4sf)_mm_setzero_ps(),
                                                0, 4, 4, 4);

  return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_load_ss (__mmask8 __U, const float* __A)
{
  return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
                                                (__v4sf) _mm_setzero_ps(),
                                                __U & 1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
{
  __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
                                                 (__v2df)_mm_setzero_pd(),
                                                 0, 2);

  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_load_sd (__mmask8 __U, const double* __A)
{
  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
                                                  (__v2df) _mm_setzero_pd(),
                                                  __U & 1);
}

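/* Usage sketch (illustrative only, not part of this header): bit 0 of the
 * mask gates the single scalar element; _mm_mask_store_ss writes its four
 * bytes only when (__U & 1) is set:
 *
 *   float out = 0.0f;
 *   _mm_mask_store_ss(&out, (__mmask8)1, _mm_set_ss(3.5f));   // out == 3.5f
 */
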
#define _mm512_shuffle_epi32(A, I) \
  ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)))

#define _mm512_mask_shuffle_epi32(W, U, A, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_shuffle_epi32(U, A, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
                                       (__v16si)_mm512_setzero_si512()))

8782 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_expand_pd(__m512d __W,__mmask8 __U,__m512d __A)8783 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
8784 {
8785 return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
8786 (__v8df) __W,
8787 (__mmask8) __U);
8788 }
8789
8790 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_pd(__mmask8 __U,__m512d __A)8791 _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
8792 {
8793 return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
8794 (__v8df) _mm512_setzero_pd (),
8795 (__mmask8) __U);
8796 }
8797
8798 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expand_epi64(__m512i __W,__mmask8 __U,__m512i __A)8799 _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
8800 {
8801 return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
8802 (__v8di) __W,
8803 (__mmask8) __U);
8804 }
8805
8806 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_epi64(__mmask8 __U,__m512i __A)8807 _mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
8808 {
8809 return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
8810 (__v8di) _mm512_setzero_si512 (),
8811 (__mmask8) __U);
8812 }
8813
8814 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_pd(__m512d __W,__mmask8 __U,void const * __P)8815 _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
8816 {
8817 return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
8818 (__v8df) __W,
8819 (__mmask8) __U);
8820 }
8821
8822 static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_pd(__mmask8 __U,void const * __P)8823 _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
8824 {
8825 return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
8826 (__v8df) _mm512_setzero_pd(),
8827 (__mmask8) __U);
8828 }
8829
8830 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_epi64(__m512i __W,__mmask8 __U,void const * __P)8831 _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
8832 {
8833 return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
8834 (__v8di) __W,
8835 (__mmask8) __U);
8836 }
8837
8838 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_epi64(__mmask8 __U,void const * __P)8839 _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
8840 {
8841 return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
8842 (__v8di) _mm512_setzero_si512(),
8843 (__mmask8) __U);
8844 }
8845
8846 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_ps(__m512 __W,__mmask16 __U,void const * __P)8847 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
8848 {
8849 return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
8850 (__v16sf) __W,
8851 (__mmask16) __U);
8852 }
8853
8854 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_ps(__mmask16 __U,void const * __P)8855 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
8856 {
8857 return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
8858 (__v16sf) _mm512_setzero_ps(),
8859 (__mmask16) __U);
8860 }
8861
8862 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_epi32(__m512i __W,__mmask16 __U,void const * __P)8863 _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
8864 {
8865 return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
8866 (__v16si) __W,
8867 (__mmask16) __U);
8868 }
8869
8870 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_epi32(__mmask16 __U,void const * __P)8871 _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
8872 {
8873 return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
8874 (__v16si) _mm512_setzero_si512(),
8875 (__mmask16) __U);
8876 }
8877
8878 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_expand_ps(__m512 __W,__mmask16 __U,__m512 __A)8879 _mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
8880 {
8881 return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
8882 (__v16sf) __W,
8883 (__mmask16) __U);
8884 }
8885
8886 static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_ps(__mmask16 __U,__m512 __A)8887 _mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
8888 {
8889 return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
8890 (__v16sf) _mm512_setzero_ps(),
8891 (__mmask16) __U);
8892 }
8893
8894 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expand_epi32(__m512i __W,__mmask16 __U,__m512i __A)8895 _mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
8896 {
8897 return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
8898 (__v16si) __W,
8899 (__mmask16) __U);
8900 }
8901
8902 static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_epi32(__mmask16 __U,__m512i __A)8903 _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
8904 {
8905 return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
8906 (__v16si) _mm512_setzero_si512(),
8907 (__mmask16) __U);
8908 }
8909
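/* Usage sketch (illustrative): the expand intrinsics take the lowest
 * contiguous source elements and place them at the positions whose mask
 * bits are set; the remaining lanes come from __W (mask form) or are
 * zeroed (maskz form). For example, with mask 0x05 (bits 0 and 2 set):
 *
 *   __m512i a = _mm512_setr_epi64(10, 11, 12, 13, 14, 15, 16, 17);
 *   __m512i r = _mm512_maskz_expand_epi64((__mmask8)0x05, a);
 *   // r = {10, 0, 11, 0, 0, 0, 0, 0} in element order
 */
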
#define _mm512_cvt_roundps_pd(A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_pd(U, A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtps_pd (__m256 __A)
{
  return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_cvtps_pd(__A),
                                              (__v8df)__W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_cvtps_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtpslo_pd (__m512 __A)
{
  return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
{
  return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
                                                (__v8df) __A,
                                                (__v8df) __W);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
                                                (__v8df) __A,
                                                (__v8df) _mm512_setzero_pd ());
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
                                               (__v16sf) __A,
                                               (__v16sf) __W);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
                                               (__v16sf) __A,
                                               (__v16sf) _mm512_setzero_ps ());
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
                                          (__mmask8) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
                                          (__mmask8) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
                                          (__mmask16) __U);
}

static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
                                          (__mmask16) __U);
}

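/* Usage sketch (illustrative): compress-store packs the selected elements
 * contiguously at the destination; only as many elements as there are set
 * mask bits get written.
 *
 *   double buf[8] = {0};
 *   __m512d v = _mm512_set_pd(8., 7., 6., 5., 4., 3., 2., 1.); // lanes 0..7 = 1..8
 *   _mm512_mask_compressstoreu_pd(buf, (__mmask8)0x0A, v);     // lanes 1 and 3
 *   // buf[0] == 2.0, buf[1] == 4.0; buf[2..7] unchanged
 */
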
#define _mm_cvt_roundsd_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)_mm_undefined_ps(), \
                                              (__mmask8)-1, (int)(R)))

#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)(__m128)(W), \
                                              (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
{
  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
                                             (__v2df)__B,
                                             (__v4sf)__W,
                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
{
  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
                                             (__v2df)__B,
                                             (__v4sf)_mm_setzero_ps(),
                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

#define _mm_cvtss_i32 _mm_cvtss_si32
#define _mm_cvtsd_i32 _mm_cvtsd_si32
#define _mm_cvti32_sd _mm_cvtsi32_sd
#define _mm_cvti32_ss _mm_cvtsi32_ss
#ifdef __x86_64__
#define _mm_cvtss_i64 _mm_cvtss_si64
#define _mm_cvtsd_i64 _mm_cvtsd_si64
#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti64_ss _mm_cvtsi64_ss
#endif

#ifdef __x86_64__
#define _mm_cvt_roundi64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
                                      (int)(R)))

#define _mm_cvt_roundsi64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
                                      (int)(R)))
#endif

#define _mm_cvt_roundsi32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))

#define _mm_cvt_roundi32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))

#ifdef __x86_64__
#define _mm_cvt_roundsi64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
                                     (int)(R)))

#define _mm_cvt_roundi64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
                                     (int)(R)))
#endif

#define _mm_cvt_roundss_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)_mm_undefined_pd(), \
                                               (__mmask8)-1, (int)(R)))

#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)(__m128d)(W), \
                                               (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
{
  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
                                            (__v4sf)__B,
                                            (__v2df)__W,
                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
{
  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
                                            (__v4sf)__B,
                                            (__v2df)_mm_setzero_pd(),
                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtu32_sd (__m128d __A, unsigned __B)
{
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundu64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
                                       (unsigned long long)(B), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
{
  __A[0] = __B;
  return __A;
}
#endif

#define _mm_cvt_roundu32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
                                      (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtu32_ss (__m128 __A, unsigned __B)
{
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
#define _mm_cvt_roundu64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
                                      (unsigned long long)(B), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
{
  __A[0] = __B;
  return __A;
}
#endif

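/* Usage sketch (illustrative): these unsigned conversions overwrite only
 * the low element via a vector subscript; the remaining elements of __A
 * pass through unchanged.
 *
 *   __m128d a = _mm_set_pd(9.0, 9.0); // both elements 9.0
 *   __m128d r = _mm_cvtu32_sd(a, 3u); // r = {3.0, 9.0}
 */
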
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
{
  return (__m512i) __builtin_ia32_selectd_512(__M,
                                              (__v16si) _mm512_set1_epi32(__A),
                                              (__v16si) __O);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
  return (__m512i) __builtin_ia32_selectq_512(__M,
                                              (__v8di) _mm512_set1_epi64(__A),
                                              (__v8di) __O);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
    char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
    char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
    char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
    char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
    char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
    char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
    char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
    char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
    char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
    char __e4, char __e3, char __e2, char __e1, char __e0) {

  return __extension__ (__m512i)(__v64qi)
    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
     __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
     __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
     __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
     __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
    short __e27, short __e26, short __e25, short __e24, short __e23,
    short __e22, short __e21, short __e20, short __e19, short __e18,
    short __e17, short __e16, short __e15, short __e14, short __e13,
    short __e12, short __e11, short __e10, short __e9, short __e8,
    short __e7, short __e6, short __e5, short __e4, short __e3,
    short __e2, short __e1, short __e0) {
  return __extension__ (__m512i)(__v32hi)
    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H,
                  int __I, int __J, int __K, int __L,
                  int __M, int __N, int __O, int __P)
{
  return __extension__ (__m512i)(__v16si)
    { __P, __O, __N, __M, __L, __K, __J, __I,
      __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \
                          e8,e9,e10,e11,e12,e13,e14,e15) \
  _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
                   (e5),(e4),(e3),(e2),(e1),(e0))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi64 (long long __A, long long __B, long long __C,
                  long long __D, long long __E, long long __F,
                  long long __G, long long __H)
{
  return __extension__ (__m512i) (__v8di)
    { __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \
  _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_set_pd (double __A, double __B, double __C, double __D,
               double __E, double __F, double __G, double __H)
{
  return __extension__ (__m512d)
    { __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \
  _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H,
               float __I, float __J, float __K, float __L,
               float __M, float __N, float __O, float __P)
{
  return __extension__ (__m512)
    { __P, __O, __N, __M, __L, __K, __J, __I,
      __H, __G, __F, __E, __D, __C, __B, __A };
}

#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
  _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
                (e4),(e3),(e2),(e1),(e0))

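/* Usage sketch (illustrative): the _mm512_set_* forms take arguments from
 * the highest element down to element 0, while the _mm512_setr_* macros
 * take them in element (memory) order, so the following are equivalent:
 *
 *   __m512i a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
 *   __m512i b = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
 */
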
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_abs_ps(__m512 __A)
{
  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), (__m512i)__A);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
{
  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K,
                                       _mm512_set1_epi32(0x7FFFFFFF),
                                       (__m512i)__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_abs_pd(__m512d __A)
{
  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
                                   (__v8di)__A);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
{
  return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K,
                                        _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
                                        (__v8di)__A);
}

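/* Usage sketch (illustrative): the absolute-value helpers above clear the
 * IEEE-754 sign bit with a bitwise AND, so -0.0 maps to +0.0 and NaN
 * payloads are preserved.
 *
 *   __m512d v = _mm512_set1_pd(-2.5);
 *   __m512d r = _mm512_abs_pd(v); // every element == 2.5
 */
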
/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars
 * as outputs. This class of vector operation forms the basis of many
 * scientific computations. In vector-reduction arithmetic, the evaluation
 * order is independent of the order of the input elements of V.
 *
 * For floating-point intrinsics:
 * 1. When using fadd/fmul intrinsics, the order of operations within the
 *    vector is unspecified (associative math).
 * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
 *    produce unspecified results.
 *
 * A bisection method is used: at each step, the vector from the previous
 * step is partitioned in half, and the operation is performed on the two
 * halves. This takes log2(n) steps, where n is the number of elements in
 * the vector.
 */

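/* Usage sketch (illustrative):
 *
 *   __m512i v = _mm512_set1_epi64(3);
 *   long long s = _mm512_reduce_add_epi64(v); // s == 8 * 3 == 24
 */
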
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
  return __builtin_reduce_add((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
  return __builtin_reduce_mul((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
  return __builtin_reduce_and((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
  return __builtin_reduce_or((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
  return __builtin_reduce_add((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
  return __builtin_reduce_mul((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
  return __builtin_reduce_and((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
  return __builtin_reduce_or((__v8di)__W);
}

// -0.0 is used to ignore the start value since it is the neutral value of
// floating point addition. For more information, please refer to
// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
}

static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_maskz_mov_pd(__M, __W);
  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
}

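/* Usage sketch (illustrative): masked-off elements are first replaced with
 * the operation's identity (-0.0 for fadd, 1.0 for fmul), so they cannot
 * affect the result.
 *
 *   __m512d v = _mm512_set1_pd(2.0);
 *   double p = _mm512_mask_reduce_mul_pd((__mmask8)0x0F, v); // p == 16.0
 */
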
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_add_epi32(__m512i __W) {
  return __builtin_reduce_add((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_epi32(__m512i __W) {
  return __builtin_reduce_mul((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_and_epi32(__m512i __W) {
  return __builtin_reduce_and((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_or_epi32(__m512i __W) {
  return __builtin_reduce_or((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi32(__M, __W);
  return __builtin_reduce_add((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
  return __builtin_reduce_mul((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_and_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
  return __builtin_reduce_and((__v16si)__W);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi32(__M, __W);
  return __builtin_reduce_or((__v16si)__W);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ps(__m512 __W) {
  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ps(__m512 __W) {
  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
  __W = _mm512_maskz_mov_ps(__M, __W);
  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
  __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epi64(__m512i __V) {
  return __builtin_reduce_max((__v8di)__V);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epu64(__m512i __V) {
  return __builtin_reduce_max((__v8du)__V);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epi64(__m512i __V) {
  return __builtin_reduce_min((__v8di)__V);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epu64(__m512i __V) {
  return __builtin_reduce_min((__v8du)__V);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
  return __builtin_reduce_max((__v8di)__V);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
  __V = _mm512_maskz_mov_epi64(__M, __V);
  return __builtin_reduce_max((__v8du)__V);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
  return __builtin_reduce_min((__v8di)__V);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
  return __builtin_reduce_min((__v8du)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epi32(__m512i __V) {
  return __builtin_reduce_max((__v16si)__V);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_reduce_max_epu32(__m512i __V) {
  return __builtin_reduce_max((__v16su)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epi32(__m512i __V) {
  return __builtin_reduce_min((__v16si)__V);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_reduce_min_epu32(__m512i __V) {
  return __builtin_reduce_min((__v16su)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
  return __builtin_reduce_max((__v16si)__V);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
  __V = _mm512_maskz_mov_epi32(__M, __V);
  return __builtin_reduce_max((__v16su)__V);
}

static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
  return __builtin_reduce_min((__v16si)__V);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
  return __builtin_reduce_min((__v16su)__V);
}

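/* Usage sketch (illustrative): for the masked min/max reductions above,
 * masked-off elements become the identity of the operation (e.g. INT_MAX
 * for signed min, 0 for unsigned max), so an all-zero mask returns that
 * identity.
 *
 *   int m = _mm512_mask_reduce_min_epi32((__mmask16)0, _mm512_set1_epi32(5));
 *   // m == __INT_MAX__
 */
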
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_max_pd(__m512d __V) {
  return __builtin_ia32_reduce_fmax_pd512(__V);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_min_pd(__m512d __V) {
  return __builtin_ia32_reduce_fmin_pd512(__V);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
  __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
  return __builtin_ia32_reduce_fmax_pd512(__V);
}

static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
  __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
  return __builtin_ia32_reduce_fmin_pd512(__V);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ps(__m512 __V) {
  return __builtin_ia32_reduce_fmax_ps512(__V);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ps(__m512 __V) {
  return __builtin_ia32_reduce_fmin_ps512(__V);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
  __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
  return __builtin_ia32_reduce_fmax_ps512(__V);
}

static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
  __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
  return __builtin_ia32_reduce_fmin_ps512(__V);
}

/// Moves the least significant 32 bits of a vector of [16 x i32] to a
/// 32-bit signed integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
///
/// \param __A
///    A vector of [16 x i32]. The least significant 32 bits are moved to the
///    destination.
/// \returns A 32-bit signed integer containing the moved value.
static __inline__ int __DEFAULT_FN_ATTRS512
_mm512_cvtsi512_si32(__m512i __A) {
  __v16si __b = (__v16si)__A;
  return __b[0];
}

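/* Usage sketch (illustrative):
 *
 *   int lo = _mm512_cvtsi512_si32(_mm512_set1_epi32(42)); // lo == 42
 */
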
/// Loads 8 double-precision (64-bit) floating-point elements stored at memory
/// locations starting at location \a base_addr at packed 32-bit integer
/// indices stored in the lower half of \a vindex scaled by \a scale, and
/// stores them in dst.
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_pd(vindex, base_addr, scale) \
  _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))

/// Loads 8 double-precision (64-bit) floating-point elements from memory
/// starting at location \a base_addr at packed 32-bit integer indices stored in
/// the lower half of \a vindex scaled by \a scale into dst using writemask
/// \a mask (elements are copied from \a src when the corresponding mask bit is
/// not set).
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
  _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \
                           (base_addr), (scale))

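/* Usage sketch (illustrative): only the low 8 of the 16 32-bit indices in
 * vindex are used, because each gathered element is 64 bits wide. With a
 * hypothetical table of doubles:
 *
 *   double table[16] = {0};
 *   __m512i idx = _mm512_set1_epi32(2);
 *   __m512d g = _mm512_i32logather_pd(idx, table, 8); // 8 copies of table[2]
 */
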
/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst.
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_epi64(vindex, base_addr, scale) \
  _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))

/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst using writemask \a mask (elements
/// are copied from \a src when the corresponding mask bit is not set).
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
  _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \
                              (base_addr), (scale))

/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
/// to memory locations starting at location \a base_addr at packed 32-bit
/// integer indices stored in \a vindex scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
  _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))

/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
/// to memory locations starting at location \a base_addr at packed 32-bit
/// integer indices stored in \a vindex scaled by \a scale. Only those elements
/// whose corresponding mask bit is set in writemask \a mask are written to
/// memory.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
  _mm512_mask_i32scatter_pd((base_addr), (mask), \
                            _mm512_castsi512_si256(vindex), (v1), (scale))

/// Stores 8 packed 64-bit integer elements located in \a v1 to memory
/// locations starting at location \a base_addr at packed 32-bit integer
/// indices stored in \a vindex scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
  _mm512_i32scatter_epi64((base_addr), \
                          _mm512_castsi512_si256(vindex), (v1), (scale))

/// Stores 8 packed 64-bit integer elements located in \a v1 to memory
/// locations starting at location \a base_addr at packed 32-bit integer
/// indices stored in \a vindex scaled by \a scale, using writemask \a mask
/// (elements whose corresponding mask bit is not set are not written to
/// memory).
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
  _mm512_mask_i32scatter_epi64((base_addr), (mask), \
                               _mm512_castsi512_si256(vindex), (v1), (scale))

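/* Usage sketch (illustrative): the scatter macros write each 64-bit element
 * of v1 to base_addr + SignExtend64(index) * scale.
 *
 *   long long out[8];
 *   __m512i idx = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
 *                                   0, 0, 0, 0, 0, 0, 0, 0);
 *   _mm512_i32loscatter_epi64(out, idx, _mm512_set1_epi64(-1), 8);
 *   // out[0..7] each become -1
 */
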
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS

#endif /* __AVX512FINTRIN_H */