1 /*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
11 #endif
12
13 #ifdef __SSE2__
14
15 #ifndef __AVX512FP16INTRIN_H
16 #define __AVX512FP16INTRIN_H
17
/* 512-bit vector types for AVX512-FP16: __v32hf is the raw vector of 32
   _Float16 elements; __m512h is the public 64-byte-aligned type and
   __m512h_u its 1-byte-aligned (unaligned) variant. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
                 __min_vector_width__(128)))
33
/// Extracts the lowest _Float16 element from a 512-bit vector of [32 x half].
static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}
37
_mm_setzero_ph(void)38 static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
39 return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
40 }
41
_mm256_setzero_ph(void)42 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
43 return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
44 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
45 }
46
/// Returns a 256-bit vector of [16 x half] with unspecified contents.
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}
50
_mm512_setzero_ph(void)51 static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
52 return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
53 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
54 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
55 }
56
/// Returns a 128-bit vector of [8 x half] with unspecified contents.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}
60
/// Returns a 512-bit vector of [32 x half] with unspecified contents.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}
64
/// Returns a 512-bit vector of [32 x half] with all 32 elements set to \a __h.
static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}
71
/// Constructs a 512-bit vector of [32 x half] from 32 scalars. Note the
/// initializer is reversed: \a __h1 becomes the highest element (index 31)
/// and \a __h32 the lowest (index 0), matching the other _mm*_set_* forms.
static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9, __h8, __h7, __h6, __h5,
                            __h4, __h3, __h2, __h1};
}
87
/* Constructs a [32 x half] vector in memory order: h1 becomes element 0.
   Implemented by reversing the arguments into _mm512_set_ph. */
#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
                (h5), (h4), (h3), (h2), (h1))
95
/// Broadcasts a half-precision complex value (a pair of _Float16, 32 bits
/// total) to all 16 complex lanes of a 512-bit vector, by bit-casting the
/// pair to a float and splatting it.
static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
}
100
/* Bit-casts between half vectors and float/double/integer vectors of the
   same width. These only reinterpret the bits; no value conversion is
   performed and no instructions are generated. */

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}
176
/* Width changes between half vectors of different sizes. The narrowing
   casts keep the low elements; the widening casts leave the new upper
   elements undefined (the -1 shuffle indices mean "don't care"). */

/// Returns the lower 128 bits ([8 x half]) of a 256-bit vector.
static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

/// Returns the lower 128 bits ([8 x half]) of a 512-bit vector.
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

/// Returns the lower 256 bits ([16 x half]) of a 512-bit vector.
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

/// Widens [8 x half] to [16 x half]; the upper 8 elements are undefined.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1);
}

/// Widens [8 x half] to [32 x half]; the upper 24 elements are undefined.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
}

/// Widens [16 x half] to [32 x half]; the upper 16 elements are undefined.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
                                 -1, -1, -1, -1, -1, -1, -1, -1);
}
212
/// Constructs a 256-bit floating-point vector of [16 x half] from a
/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
/// contain the value of the source vector. The upper 128 bits are set
/// to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a) {
  /* Indices 8-15 select elements of the second operand (the zero vector). */
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
231
/// Constructs a 512-bit floating-point vector of [32 x half] from a
/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
/// contain the value of the source vector. The upper 384 bits are set
/// to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a) {
  /* Indices 8-15 select the zero operand; they are reused three times to
     fill the upper 24 result elements with zeros. */
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}
251
/// Constructs a 512-bit floating-point vector of [32 x half] from a
/// 256-bit floating-point vector of [16 x half]. The lower 256 bits
/// contain the value of the source vector. The upper 256 bits are set
/// to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a) {
  /* Indices 16-31 select elements of the second operand (the zero vector). */
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}
272
/* Compares the lowest half elements of A and B with predicate P and
   rounding/SAE control R, returning the comparison result as an int.
   _mm_comi_sh uses the current rounding mode. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
278
/* Scalar comparisons of the lowest half elements, returning 0 or 1.
   The _mm_comi* forms use the signaling predicates (_CMP_*_OS, _CMP_NEQ_US);
   the _mm_ucomi* forms use the quiet predicates (_CMP_*_OQ, _CMP_NEQ_UQ). */

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
                                                          __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
                                                           __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
                                                            __m128h B) {
  return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}
350
/* Packed [32 x half] addition. The _mask_ variant merges the result with
   __W under mask __U; the _maskz_ variant zeroes masked-off elements.
   The _round_ macros take explicit rounding control R. */

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
382
/* Packed [32 x half] subtraction, with merge-masking (_mask_),
   zero-masking (_maskz_), and explicit-rounding (_round_) variants. */

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
414
/* Packed [32 x half] multiplication, with merge-masking (_mask_),
   zero-masking (_maskz_), and explicit-rounding (_round_) variants. */

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
446
/* Packed [32 x half] division, with merge-masking (_mask_),
   zero-masking (_maskz_), and explicit-rounding (_round_) variants. */

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
478
/* Packed [32 x half] minimum via vminph, with merge-masking (_mask_),
   zero-masking (_maskz_), and explicit-rounding (_round_) variants. */

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
511
/* Packed [32 x half] maximum via vmaxph, with merge-masking (_mask_),
   zero-masking (_maskz_), and explicit-rounding (_round_) variants. */

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
544
/// Computes the absolute value of each half element by clearing the sign
/// bit of every 16-bit lane (AND with 0x7FFF, packed pairwise as
/// 0x7FFF7FFF per 32-bit lane).
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}
548
/* Complex conjugate of [16 x half-precision complex] values: XOR with a
   broadcast -0.0f flips the sign bit of each 32-bit lane, i.e. the sign of
   the high (imaginary) half element of each complex pair. Mask variants
   merge with __W or zero under the 16-bit complex-lane mask __U. */

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}
565
/* Scalar half addition on the lowest element; the upper 7 elements of __A
   pass through unchanged. Mask variants select the result element from the
   sum, __W, or zero under mask __U; _round_ macros take rounding control. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
601
/* Scalar half subtraction on the lowest element; the upper 7 elements of
   __A pass through. Mask and _round_ variants as for _mm_add_sh. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
637
/* Scalar half multiplication on the lowest element; the upper 7 elements
   of __A pass through. Mask and _round_ variants as for _mm_add_sh. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
673
/* Scalar half division on the lowest element; the upper 7 elements of __A
   pass through. Mask and _round_ variants as for _mm_add_sh. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_div_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
709
/* Scalar half minimum of the lowest elements via vminsh, with merge-mask,
   zero-mask, and explicit-rounding variants. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_min_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
748
/* Scalar half maximum of the lowest elements via vmaxsh, with merge-mask,
   zero-mask, and explicit-rounding variants. */

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_max_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
787
// Packed/scalar FP16 compares producing a mask. P is the comparison predicate
// immediate; R is the SAE/rounding immediate. Macros because both must be
// compile-time constants for the builtins.
#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)-1, (int)(R)))

// Mask-qualified compare: U zeroes out result bits for masked-off lanes.
#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)(U), (int)(R)))

// Current-direction convenience wrappers over the round variants.
#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

// Scalar (low-element) FP16 compares; only bit 0 of the result mask is set.
#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P)                                               \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
      _MM_FROUND_CUR_DIRECTION))
823 // loads with vmovsh:
// Load one _Float16 from (possibly unaligned) memory into lane 0, zeroing
// lanes 1-7. The packed/may_alias struct makes the unaligned access legal.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

// Masked scalar load. The shuffle builds {__W[0], 0, 0, ...} as the merge
// source, so masked-off upper lanes of the result are zero, not __W's.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  // Only bit 0 of the mask is meaningful for a scalar load.
  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}

// Zero-masked scalar load: lane 0 is loaded or zeroed per bit 0 of __U.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}
845
// Aligned full-vector loads: a plain typed dereference, so the pointer must
// meet the vector type's alignment (64/32/16 bytes respectively).
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

// Unaligned loads: the packed/may_alias struct wrapper drops the alignment
// requirement and sidesteps strict-aliasing, using the __m*h_u (align 1)
// vector typedefs.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}
882
883 // stores with vmovsh:
// Store lane 0 of __a to (possibly unaligned) memory; the packed/may_alias
// struct makes the unaligned single-element store legal.
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

// Masked scalar store: lane 0 is written only when bit 0 of __U is set.
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}

// Aligned full-vector stores: plain typed assignment, so __P must meet the
// vector type's alignment.
static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}

// Unaligned stores via the packed/may_alias struct wrapper (see loadu above).
static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}
936
937 // moves with vmovsh:
// Copy lane 0 of __b into __a, keeping __a's upper lanes. Note: mutates and
// returns the by-value copy of __a.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
                                                            __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

// Masked move: per-lane select between the moved result and __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

// Zero-masked move: masked-off lanes come from the zero vector.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}

// vmovw:
// Widen a 16-bit integer into lane 0 of a vector, zeroing the rest.
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}

// Extract lane 0 of a vector as a 16-bit integer.
static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
  __v8hi __b = (__v8hi)__a;
  return __b[0];
}
967
// Packed FP16 reciprocal approximation. Unmasked form passes an undefined
// vector as the (ignored, mask is all-ones) merge source.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

// Merge-masking: masked-off lanes take __W's value (builtin mask semantics).
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

// Zero-masking: masked-off lanes are zeroed.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

// Packed FP16 reciprocal-square-root approximation; same masking pattern.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}
1001
// Packed FP16 getmant. B (norm interval) and C (sign control) are packed into
// a single immediate as (C << 2) | B; macros because the immediate and the
// rounding control must be compile-time constants.
#define _mm512_getmant_ph(A, B, C)                                             \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

// Explicit rounding-control variants.
#define _mm512_getmant_round_ph(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1032
// Packed FP16 getexp (extract biased exponent as an FP value), in the standard
// unmasked / merge-mask / zero-mask triple plus _round macro variants.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Explicit rounding/SAE-control variants; R must be a compile-time constant.
#define _mm512_getexp_round_ph(A, R)                                           \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R)                                  \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))
1065
// Packed FP16 scalef (scale __A by powers of two taken from __B), in the
// standard unmasked / merge-mask / zero-mask triple plus _round macros.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Explicit rounding-control variants; R must be a compile-time constant.
#define _mm512_scalef_round_ph(A, B, R)                                        \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1101
// Packed FP16 roundscale (round to a specified number of fraction bits, per
// the immediate). NOTE(review): the unmasked form reuses (A) as the
// pass-through source operand; with the all-ones mask it is ignored per the
// builtin's mask semantics.
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1,   \
      _MM_FROUND_CUR_DIRECTION))

// Argument order here is (dst, mask, src, imm), unlike most siblings.
#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

// Explicit rounding-control variants.
#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
                                           (__v32hf)(__m512h)(A),              \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
                                           (__v32hf)_mm512_setzero_ph(),       \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
                                           (__v32hf)_mm512_undefined_ph(),     \
                                           (__mmask32)-1, (int)(R)))
1131
// Packed FP16 reduce (per the imm8 reduction control), standard unmasked /
// merge-mask / zero-mask triple plus explicit-rounding variants. Not a
// horizontal reduction; operates lane-wise per the builtin.
#define _mm512_reduce_ph(A, imm)                                               \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W),             \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))
1161
// Scalar FP16 reciprocal approximation (lane 0); unmasked / merge-mask /
// zero-mask triple. These builtins take no rounding operand.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

// Scalar FP16 reciprocal-square-root approximation; same pattern as rcp_sh.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
1202
// Scalar FP16 getmant: C (norm interval) and D (sign control) are packed into
// one immediate as (D << 2) | C; macros because the immediate and rounding
// control must be compile-time constants.
#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D)                                             \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1232
// Scalar FP16 getexp: functions for the current-direction forms, macros for
// the explicit rounding/SAE-control forms (R must be a compile-time constant).
#define _mm_getexp_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1268
// Scalar FP16 scalef (scale lane 0 of __A by a power of two from __B):
// functions for current-direction forms, macros for explicit rounding.
#define _mm_scalef_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                     (__v8hf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1304
// Scalar FP16 roundscale; all macros since both imm and R must be
// compile-time constants for the builtin.
#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm)                                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), (int)(R)))
1334
// Scalar FP16 reduce with reduction-control immediate C; all macros since the
// immediate (and R) must be compile-time constants.
#define _mm_reduce_sh(A, B, C)                                                 \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), (int)(R)))
1364
// Packed FP16 square root. The sqrtph512 builtin is unmasked; masking is done
// afterwards with a per-lane select.
#define _mm512_sqrt_round_ph(A, R)                                             \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)_mm512_setzero_ph()))

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

// Merge-masking: masked-off lanes take __W's value via the select builtin.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)(__m512h)(__W));
}

// Zero-masking: masked-off lanes are zeroed via the select builtin.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)_mm512_setzero_ph());
}
1398
// Scalar FP16 square root: explicit-rounding macros plus the
// current-direction function form.
#define _mm_sqrt_round_sh(A, B, R)                                             \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R)                                    \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
                                                            __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}
1420
_mm_mask_sqrt_sh(__m128h __W,__mmask32 __U,__m128h __A,__m128h __B)1421 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
1422 __mmask32 __U,
1423 __m128h __A,
1424 __m128h __B) {
1425 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1426 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
1427 (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1428 }
1429
_mm_maskz_sqrt_sh(__mmask32 __U,__m128h __A,__m128h __B)1430 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
1431 __m128h __A,
1432 __m128h __B) {
1433 return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1434 (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1435 (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
1436 }
1437
// FP16 fpclass tests (category determined by imm8), producing a mask; macros
// because imm must be a compile-time constant.
#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)-1))

// Scalar forms: only bit 0 of the result mask is meaningful.
#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))
1453
// Convert 8 doubles to 8 FP16 values (narrowing, result in an __m128h):
// explicit-rounding macros plus current-direction function triple.
#define _mm512_cvt_roundpd_ph(A, R)                                            \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundpd_ph(W, U, A, R)                                 \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ph(U, A, R)                                   \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1484
// Convert the 8 low FP16 values of an __m128h to 8 doubles (widening):
// explicit-rounding macros plus current-direction function triple.
#define _mm512_cvt_roundph_pd(A, R)                                            \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
      (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_pd(W, U, A, R)                                 \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_pd(U, A, R)                                   \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
      (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1515
/* Scalar FP16 -> FP32: convert the low half-precision element of B to single
   precision; the upper result lanes come from A.  _mask/_maskz control only
   the low lane (merge from W / zero).  R selects the rounding/SAE behavior in
   the round forms; the functions use _MM_FROUND_CUR_DIRECTION. */
#define _mm_cvt_roundsh_ss(A, B, R)                                            \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
                                               (__v4sf)_mm_undefined_ps(),     \
                                               (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R)                                 \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask(                                \
      (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_ss(U, A, B, R)                                   \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
                                               (__v4sf)_mm_setzero_ps(),       \
                                               (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
                                                            __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked low lane: taken from __W when bit 0 of __U is clear. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
                                                                 __mmask8 __U,
                                                                 __m128 __A,
                                                                 __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
                                                     (__v4sf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked low lane: zeroed when bit 0 of __U is clear. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
                                                                  __m128 __A,
                                                                  __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1553
/* Scalar FP32 -> FP16: convert the low single-precision element of B to half
   precision; the upper result lanes come from A.  _mask/_maskz control only
   the low lane (merge from W / zero). */
#define _mm_cvt_roundss_sh(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
                                                (__v8hf)_mm_undefined_ph(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundss_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask(                               \
      (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sh(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
                                                (__v8hf)_mm_setzero_ph(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
                                                             __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked low lane: taken from __W when bit 0 of __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked low lane: zeroed when bit 0 of __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
                                                                   __m128h __A,
                                                                   __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1591
/* Scalar FP64 -> FP16: convert the low double-precision element of B to half
   precision; the upper result lanes come from A.  _mask/_maskz control only
   the low lane (merge from W / zero). */
#define _mm_cvt_roundsd_sh(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
                                                (__v8hf)_mm_undefined_ph(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask(                               \
      (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_sh(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
                                                (__v8hf)_mm_setzero_ph(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
                                                             __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked low lane: taken from __W when bit 0 of __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked low lane: zeroed when bit 0 of __U is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1628
/* Scalar FP16 -> FP64: convert the low half-precision element of B to double
   precision; the upper result lane comes from A.  _mask/_maskz control only
   the low lane (merge from W / zero). */
#define _mm_cvt_roundsh_sd(A, B, R)                                            \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_undefined_pd(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R)                                 \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask(                               \
      (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_sd(U, A, B, R)                                   \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_setzero_pd(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
                                                             __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked low lane: taken from __W when bit 0 of __U is clear. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
                                                                  __mmask8 __U,
                                                                  __m128d __A,
                                                                  __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked low lane: zeroed when bit 0 of __U is clear. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1665
/* Convert 32 packed half-precision elements in A to signed 16-bit integers
   (vcvtph2w).  Round forms take rounding control R; functions use the current
   rounding mode.  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundph_epi16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_undefined_epi32(), \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_setzero_epi32(),   \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1699
/* Convert-with-truncation: 32 half-precision elements in A to signed 16-bit
   integers (vcvttph2w).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvtt_roundph_epi16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask(                                  \
      (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A),                     \
                                             (__v32hi)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1733
/* Convert 32 packed signed 16-bit integers in A to half-precision values
   (vcvtw2ph).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundepi16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A),                      \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask(                                   \
      (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1766
/* Convert 32 packed half-precision elements in A to unsigned 16-bit integers
   (vcvtph2uw).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundph_epu16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask(                                  \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A),                     \
                                             (__v32hu)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1800
/* Convert-with-truncation: 32 half-precision elements in A to unsigned 16-bit
   integers (vcvttph2uw).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvtt_roundph_epu16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask(                                 \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W),      \
                                              (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A),                    \
                                              (__v32hu)_mm512_setzero_epi32(), \
                                              (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1834
/* Convert 32 packed unsigned 16-bit integers in A to half-precision values
   (vcvtuw2ph).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundepu16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A),                     \
                                             (__v32hf)_mm512_undefined_ph(),   \
                                             (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask(                                  \
      (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1867
/* Widening convert: 16 half-precision elements (256-bit input) to 16 signed
   32-bit integers (vcvtph2dq).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundph_epi32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask(                                  \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epi32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A),                     \
                                             (__v16si)_mm512_setzero_epi32(),  \
                                             (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1901
/* Widening convert: 16 half-precision elements (256-bit input) to 16 unsigned
   32-bit integers (vcvtph2udq).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundph_epu32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask(                                 \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A),                    \
                                              (__v16su)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1935
/* Narrowing convert: 16 signed 32-bit integers to 16 half-precision values
   (256-bit result, vcvtdq2ph).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundepi32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A),                     \
                                             (__v16hf)_mm256_undefined_ph(),   \
                                             (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W),       \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask(                                  \
      (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1968
/* Narrowing convert: 16 unsigned 32-bit integers to 16 half-precision values
   (256-bit result, vcvtudq2ph).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundepu32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask(                                 \
      (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2001
/* Convert-with-truncation: 16 half-precision elements (256-bit input) to 16
   signed 32-bit integers (vcvttph2dq).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvtt_roundph_epi32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask(                                 \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W),      \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A),                    \
                                              (__v16si)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2035
/* Convert-with-truncation: 16 half-precision elements (256-bit input) to 16
   unsigned 32-bit integers (vcvttph2udq).  _mask merges from W, _maskz
   zeroes. */
#define _mm512_cvtt_roundph_epu32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W),     \
                                               (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U),           \
      (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2069
/* Narrowing convert: 8 signed 64-bit integers to 8 half-precision values
   (128-bit result, vcvtqq2ph).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundepi64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2101
/* Widening convert: 8 half-precision elements (128-bit input) to 8 signed
   64-bit integers (vcvtph2qq).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundph_epi64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A),                      \
                                             (__v8di)_mm512_undefined_epi32(), \
                                             (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask(                                  \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2134
/* Narrowing convert: 8 unsigned 64-bit integers to 8 half-precision values
   (128-bit result, vcvtuqq2ph).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundepu64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2166
/* Widening convert: 8 half-precision elements (128-bit input) to 8 unsigned
   64-bit integers (vcvtph2uqq).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvt_roundph_epu64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2199
/* Convert-with-truncation: 8 half-precision elements (128-bit input) to 8
   signed 64-bit integers (vcvttph2qq).  _mask merges from W, _maskz zeroes. */
#define _mm512_cvtt_roundph_epi64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W),        \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked: lanes with a clear bit in __U are copied from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2232
/* Truncating conversion (round toward zero) of 8 packed FP16 elements in A
   to unsigned 64-bit integers. */
#define _mm512_cvtt_roundph_epu64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

/* Merge-masking variant: unselected elements come from W. */
#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W),       \
                                               (__mmask8)(U), (int)(R)))

/* Zero-masking variant: unselected elements are zero. */
#define _mm512_maskz_cvtt_roundph_epu64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

/* Truncating conversion of 8 packed FP16 elements in __A to unsigned 64-bit
   integers. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: unselected elements come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: unselected elements are zero. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2265
/* Convert the low FP16 element of A to a signed 32-bit integer with explicit
   rounding control R. */
#define _mm_cvt_roundsh_i32(A, R)                                              \
  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))

/* Convert the low FP16 element of __A to a signed 32-bit integer using the
   current rounding direction. */
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}

/* Convert the low FP16 element of A to an unsigned 32-bit integer with
   explicit rounding control R. */
#define _mm_cvt_roundsh_u32(A, R)                                              \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))

/* Convert the low FP16 element of __A to an unsigned 32-bit integer using
   the current rounding direction. */
static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvtsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2281
/* 64-bit scalar conversions are only available in 64-bit mode. */
#ifdef __x86_64__
/* Convert the low FP16 element of A to a signed 64-bit integer with explicit
   rounding control R. */
#define _mm_cvt_roundsh_i64(A, R)                                              \
  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))

/* Convert the low FP16 element of __A to a signed 64-bit integer using the
   current rounding direction. */
static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
                                               _MM_FROUND_CUR_DIRECTION);
}

/* Convert the low FP16 element of A to an unsigned 64-bit integer with
   explicit rounding control R. */
#define _mm_cvt_roundsh_u64(A, R)                                              \
  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))

/* Convert the low FP16 element of __A to an unsigned 64-bit integer using
   the current rounding direction. */
static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif // __x86_64__
2300
/* Convert unsigned 32-bit integer B to FP16 with explicit rounding control R
   and place it in the low element of the result; upper elements come from A. */
#define _mm_cvt_roundu32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))

/* Convert unsigned 32-bit integer __B to FP16 in the low element of __A;
   implemented as a plain element store, relying on the implicit integer ->
   _Float16 conversion. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
/* Convert unsigned 64-bit integer B to FP16 with explicit rounding control R
   in the low element; upper elements come from A. */
#define _mm_cvt_roundu64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
                                        (int)(R)))

/* Convert unsigned 64-bit integer __B to FP16 in the low element of __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
  __A[0] = __B;
  return __A;
}
#endif
2321
/* Convert signed 32-bit integer B to FP16 with explicit rounding control R
   in the low element; upper elements come from A. */
#define _mm_cvt_roundi32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))

/* Convert signed 32-bit integer __B to FP16 in the low element of __A;
   relies on the implicit int -> _Float16 conversion. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
                                                              int __B) {
  __A[0] = __B;
  return __A;
}

#ifdef __x86_64__
/* Convert signed 64-bit integer B to FP16 with explicit rounding control R
   in the low element; upper elements come from A. */
#define _mm_cvt_roundi64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))

/* Convert signed 64-bit integer __B to FP16 in the low element of __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
                                                              long long __B) {
  __A[0] = __B;
  return __A;
}
#endif
2341
/* Truncating conversion of the low FP16 element of A to a signed 32-bit
   integer. */
#define _mm_cvtt_roundsh_i32(A, R)                                             \
  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))

/* Truncating conversion of the low FP16 element of __A to a signed 32-bit
   integer. */
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
                                          _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
/* Truncating conversion of the low FP16 element of A to a signed 64-bit
   integer (64-bit mode only). */
#define _mm_cvtt_roundsh_i64(A, R)                                             \
  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))

/* Truncating conversion of the low FP16 element of __A to a signed 64-bit
   integer. */
static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
                                                _MM_FROUND_CUR_DIRECTION);
}
#endif
2359
/* Truncating conversion of the low FP16 element of A to an unsigned 32-bit
   integer. */
#define _mm_cvtt_roundsh_u32(A, R)                                             \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))

/* Truncating conversion of the low FP16 element of __A to an unsigned 32-bit
   integer. */
static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvttsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#ifdef __x86_64__
/* Truncating conversion of the low FP16 element of A to an unsigned 64-bit
   integer (64-bit mode only). */
#define _mm_cvtt_roundsh_u64(A, R)                                             \
  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))

/* Truncating conversion of the low FP16 element of __A to an unsigned 64-bit
   integer. */
static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif
2379
/* Widen 16 packed FP16 elements in A to single precision with explicit
   rounding control R. */
#define _mm512_cvtx_roundph_ps(A, R)                                           \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A),                     \
                                             (__v16sf)_mm512_undefined_ps(),   \
                                             (__mmask16)(-1), (int)(R)))

/* Merge-masking variant: unselected elements come from W. */
#define _mm512_mask_cvtx_roundph_ps(W, U, A, R)                                \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W),       \
                                             (__mmask16)(U), (int)(R)))

/* Zero-masking variant: unselected elements are zero. */
#define _mm512_maskz_cvtx_roundph_ps(U, A, R)                                  \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask(                                  \
      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))

/* Widen 16 packed FP16 elements in __A to single precision. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: unselected elements come from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: unselected elements are zero. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2411
/* Narrow 16 packed single-precision elements in A to FP16 with explicit
   rounding control R. */
#define _mm512_cvtx_roundps_ph(A, R)                                           \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

/* Merge-masking variant: unselected elements come from W. */
#define _mm512_mask_cvtx_roundps_ph(W, U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

/* Zero-masking variant: unselected elements are zero. */
#define _mm512_maskz_cvtx_roundps_ph(U, A, R)                                  \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask(                                 \
      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

/* Narrow 16 packed single-precision elements in __A to FP16. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: unselected elements come from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: unselected elements are zero. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2443
/* 512-bit packed FP16 fused multiply-add with explicit rounding control.
   fmsub/fnmadd/fnmsub variants are expressed by negating the corresponding
   operand(s) of the single vfmaddph builtin:
     fmadd  = A*B + C,  fmsub  = A*B - C,
     fnmadd = -(A*B) + C (negate one multiplicand),
     fnmsub = -(A*B) - C.
   _mask selects between result and A; _mask3 selects between result and C;
   _maskz zeroes unselected elements. */
#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))
2503
_mm512_fmadd_ph(__m512h __A,__m512h __B,__m512h __C)2504 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
2505 __m512h __B,
2506 __m512h __C) {
2507 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2508 (__v32hf)__C, (__mmask32)-1,
2509 _MM_FROUND_CUR_DIRECTION);
2510 }
2511
2512 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2513 _mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2514 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2515 (__v32hf)__C, (__mmask32)__U,
2516 _MM_FROUND_CUR_DIRECTION);
2517 }
2518
2519 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2520 _mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2521 return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
2522 (__v32hf)__C, (__mmask32)__U,
2523 _MM_FROUND_CUR_DIRECTION);
2524 }
2525
2526 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2527 _mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2528 return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
2529 (__v32hf)__C, (__mmask32)__U,
2530 _MM_FROUND_CUR_DIRECTION);
2531 }
2532
_mm512_fmsub_ph(__m512h __A,__m512h __B,__m512h __C)2533 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
2534 __m512h __B,
2535 __m512h __C) {
2536 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2537 -(__v32hf)__C, (__mmask32)-1,
2538 _MM_FROUND_CUR_DIRECTION);
2539 }
2540
2541 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ph(__m512h __A,__mmask32 __U,__m512h __B,__m512h __C)2542 _mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
2543 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
2544 -(__v32hf)__C, (__mmask32)__U,
2545 _MM_FROUND_CUR_DIRECTION);
2546 }
2547
2548 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2549 _mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2550 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2551 (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2552 _MM_FROUND_CUR_DIRECTION);
2553 }
2554
_mm512_fnmadd_ph(__m512h __A,__m512h __B,__m512h __C)2555 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
2556 __m512h __B,
2557 __m512h __C) {
2558 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2559 (__v32hf)__C, (__mmask32)-1,
2560 _MM_FROUND_CUR_DIRECTION);
2561 }
2562
2563 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ph(__m512h __A,__m512h __B,__m512h __C,__mmask32 __U)2564 _mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
2565 return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
2566 (__v32hf)__C, (__mmask32)__U,
2567 _MM_FROUND_CUR_DIRECTION);
2568 }
2569
2570 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2571 _mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2572 return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
2573 (__v32hf)__C, (__mmask32)__U,
2574 _MM_FROUND_CUR_DIRECTION);
2575 }
2576
_mm512_fnmsub_ph(__m512h __A,__m512h __B,__m512h __C)2577 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
2578 __m512h __B,
2579 __m512h __C) {
2580 return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
2581 -(__v32hf)__C, (__mmask32)-1,
2582 _MM_FROUND_CUR_DIRECTION);
2583 }
2584
2585 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ph(__mmask32 __U,__m512h __A,__m512h __B,__m512h __C)2586 _mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
2587 return (__m512h)__builtin_ia32_vfmaddph512_maskz(
2588 -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
2589 _MM_FROUND_CUR_DIRECTION);
2590 }
2591
/* 512-bit packed FP16 fused multiply with alternating add/subtract, with
   explicit rounding control.  fmsubadd is obtained by negating C. */
#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2626
/* 512-bit packed FP16 fused multiply with alternating add/subtract using the
   current rounding direction.  fmsubadd variants negate __C; masking follows
   the usual _mask/_mask3/_maskz conventions. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2675
/* mask3 forms of fmsub/fmsubadd use the dedicated vfmsub(add)ph builtins so
   that unselected elements merge from C (the subtrahend) rather than from a
   negated copy of it. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2699
/* Merge-masking fnmadd/fnmsub: the mask merges from unnegated __A, so the
   negation is applied to __B (and __C for fnmsub) instead. */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))

/* mask3 fnmsub merges from __C, so it uses vfmsubph and negates __A. */
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2735
_mm_fmadd_sh(__m128h __W,__m128h __A,__m128h __B)2736 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
2737 __m128h __A,
2738 __m128h __B) {
2739 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2740 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2741 }
2742
_mm_mask_fmadd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2743 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
2744 __mmask8 __U,
2745 __m128h __A,
2746 __m128h __B) {
2747 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
2748 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2749 }
2750
2751 #define _mm_fmadd_round_sh(A, B, C, R) \
2752 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2753 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2754 (__mmask8)-1, (int)(R)))
2755
2756 #define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
2757 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2758 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2759 (__mmask8)(U), (int)(R)))
2760
2761 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2762 _mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2763 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
2764 (__mmask8)__U,
2765 _MM_FROUND_CUR_DIRECTION);
2766 }
2767
2768 #define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
2769 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2770 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2771 (__mmask8)(U), (int)(R)))
2772
2773 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2774 _mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2775 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
2776 (__mmask8)__U,
2777 _MM_FROUND_CUR_DIRECTION);
2778 }
2779
2780 #define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
2781 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2782 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2783 (__mmask8)(U), (int)(R)))
2784
_mm_fmsub_sh(__m128h __W,__m128h __A,__m128h __B)2785 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
2786 __m128h __A,
2787 __m128h __B) {
2788 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2789 -(__v8hf)__B, (__mmask8)-1,
2790 _MM_FROUND_CUR_DIRECTION);
2791 }
2792
_mm_mask_fmsub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2793 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
2794 __mmask8 __U,
2795 __m128h __A,
2796 __m128h __B) {
2797 return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
2798 -(__v8hf)__B, (__mmask8)__U,
2799 _MM_FROUND_CUR_DIRECTION);
2800 }
2801
2802 #define _mm_fmsub_round_sh(A, B, C, R) \
2803 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2804 (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2805 (__mmask8)-1, (int)(R)))
2806
2807 #define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
2808 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2809 (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2810 (__mmask8)(U), (int)(R)))
2811
2812 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2813 _mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2814 return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
2815 -(__v8hf)__C, (__mmask8)__U,
2816 _MM_FROUND_CUR_DIRECTION);
2817 }
2818
/* Scalar FP16 fused multiply-subtract of the low elements with explicit
   rounding control R; lane 0 is zeroed when bit 0 of U is clear.
   Fix: parenthesize the macro parameter R in the cast — the original wrote
   `(int)R`, inconsistent with every other macro in this file and mis-binding
   for low-precedence argument expressions (CERT PRE01-C). */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))
2823
/* mask3 scalar fmsub uses the dedicated vfmsubsh builtin so that lane 0
   merges from unnegated __Y when the mask bit is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
2835
_mm_fnmadd_sh(__m128h __W,__m128h __A,__m128h __B)2836 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
2837 __m128h __A,
2838 __m128h __B) {
2839 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2840 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2841 }
2842
2843 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2844 _mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2845 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
2846 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2847 }
2848
2849 #define _mm_fnmadd_round_sh(A, B, C, R) \
2850 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2851 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2852 (__mmask8)-1, (int)(R)))
2853
2854 #define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
2855 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2856 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
2857 (__mmask8)(U), (int)(R)))
2858
2859 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2860 _mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2861 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
2862 (__mmask8)__U,
2863 _MM_FROUND_CUR_DIRECTION);
2864 }
2865
2866 #define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
2867 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2868 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
2869 (__mmask8)(U), (int)(R)))
2870
2871 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2872 _mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2873 return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2874 (__mmask8)__U,
2875 _MM_FROUND_CUR_DIRECTION);
2876 }
2877
2878 #define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
2879 ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
2880 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2881 (__mmask8)(U), (int)(R)))
2882
_mm_fnmsub_sh(__m128h __W,__m128h __A,__m128h __B)2883 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
2884 __m128h __A,
2885 __m128h __B) {
2886 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2887 (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
2888 }
2889
2890 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W,__mmask8 __U,__m128h __A,__m128h __B)2891 _mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
2892 return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
2893 (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
2894 }
2895
2896 #define _mm_fnmsub_round_sh(A, B, C, R) \
2897 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2898 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2899 (__mmask8)-1, (int)(R)))
2900
2901 #define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
2902 ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
2903 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
2904 (__mmask8)(U), (int)(R)))
2905
2906 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2907 _mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2908 return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
2909 (__mmask8)__U,
2910 _MM_FROUND_CUR_DIRECTION);
2911 }
2912
2913 #define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
2914 ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
2915 (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
2916 (__mmask8)(U), (int)(R)))
2917
2918 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W,__m128h __X,__m128h __Y,__mmask8 __U)2919 _mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
2920 return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
2921 (__mmask8)__U,
2922 _MM_FROUND_CUR_DIRECTION);
2923 }
2924
2925 #define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
2926 ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
2927 (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
2928 (__mmask8)(U), (int)(R)))
2929
_mm_fcmadd_sch(__m128h __A,__m128h __B,__m128h __C)2930 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
2931 __m128h __B,
2932 __m128h __C) {
2933 return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2934 (__v4sf)__C, (__mmask8)-1,
2935 _MM_FROUND_CUR_DIRECTION);
2936 }
2937
2938 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A,__mmask8 __U,__m128h __B,__m128h __C)2939 _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2940 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
2941 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2942 }
2943
2944 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2945 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2946 return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2947 (__v4sf)__C, (__mmask8)__U,
2948 _MM_FROUND_CUR_DIRECTION);
2949 }
2950
2951 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A,__m128h __B,__m128h __C,__mmask8 __U)2952 _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
2953 return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
2954 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
2955 }
2956
2957 #define _mm_fcmadd_round_sch(A, B, C, R) \
2958 ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
2959 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2960 (__mmask8)-1, (int)(R)))
2961
2962 #define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
2963 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
2964 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2965 (__mmask8)(U), (int)(R)))
2966
2967 #define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
2968 ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
2969 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2970 (__mmask8)(U), (int)(R)))
2971
2972 #define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
2973 ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
2974 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
2975 (__mmask8)(U), (int)(R)))
2976
_mm_fmadd_sch(__m128h __A,__m128h __B,__m128h __C)2977 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
2978 __m128h __B,
2979 __m128h __C) {
2980 return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
2981 (__v4sf)__C, (__mmask8)-1,
2982 _MM_FROUND_CUR_DIRECTION);
2983 }
2984
2985 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A,__mmask8 __U,__m128h __B,__m128h __C)2986 _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
2987 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
2988 (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
2989 }
2990
2991 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)2992 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
2993 return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
2994 (__v4sf)__C, (__mmask8)__U,
2995 _MM_FROUND_CUR_DIRECTION);
2996 }
2997
2998 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sch(__m128h __A,__m128h __B,__m128h __C,__mmask8 __U)2999 _mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
3000 return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
3001 (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
3002 }
3003
3004 #define _mm_fmadd_round_sch(A, B, C, R) \
3005 ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
3006 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3007 (__mmask8)-1, (int)(R)))
3008
3009 #define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
3010 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
3011 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3012 (__mmask8)(U), (int)(R)))
3013
3014 #define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
3015 ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
3016 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3017 (__mmask8)(U), (int)(R)))
3018
3019 #define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
3020 ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
3021 (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
3022 (__mmask8)(U), (int)(R)))
3023
// Scalar complex-FP16 conjugate multiply.  The third builtin argument is
// the merge source: undefined for the unmasked form, __W for the
// merge-masked form, zero for the zero-masked form.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

// Merge-masked scalar conjugate complex multiply: masked-off state comes
// from __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__W, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

// Zero-masked scalar conjugate complex multiply.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Rounding-control (R) variants of the three forms above.
#define _mm_fcmul_round_sch(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3059
// Scalar complex-FP16 multiply (non-conjugate twin of fcmul_sch).  The
// third builtin argument is the merge source: undefined / __W / zero for
// the unmasked, merge-masked and zero-masked forms respectively.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

// Merge-masked scalar complex multiply: masked-off state comes from __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                               (__v4sf)__W, (__mmask8)__U,
                                               _MM_FROUND_CUR_DIRECTION);
}

// Zero-masked scalar complex multiply.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Rounding-control (R) variants of the three forms above.
#define _mm_fmul_round_sch(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3097
// Packed 512-bit complex-FP16 conjugate multiply: 16 complex (32-bit)
// elements, one mask bit per complex element.  Third builtin argument is
// the merge source (undefined / __W / zero by form).
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

// Merge-masked packed conjugate complex multiply: masked-off complex
// elements come from __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__W, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

// Zero-masked packed conjugate complex multiply.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Rounding-control (R) variants of the three forms above.
#define _mm512_fcmul_round_pch(A, B, R)                                        \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3133
// Packed 512-bit complex-FP16 multiply (non-conjugate twin of
// fcmul_pch): 16 complex (32-bit) elements, one mask bit each.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
                                                                __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

// Merge-masked packed complex multiply: masked-off complex elements come
// from __W.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                  (__v16sf)__W, (__mmask16)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

// Zero-masked packed complex multiply.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Rounding-control (R) variants of the three forms above.
#define _mm512_fmul_round_pch(A, B, R)                                         \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3169
_mm512_fcmadd_pch(__m512h __A,__m512h __B,__m512h __C)3170 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
3171 __m512h __B,
3172 __m512h __C) {
3173 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3174 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
3175 _MM_FROUND_CUR_DIRECTION);
3176 }
3177
3178 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A,__mmask16 __U,__m512h __B,__m512h __C)3179 _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3180 return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
3181 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3182 _MM_FROUND_CUR_DIRECTION);
3183 }
3184
3185 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A,__m512h __B,__m512h __C,__mmask16 __U)3186 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3187 return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
3188 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3189 _MM_FROUND_CUR_DIRECTION);
3190 }
3191
3192 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U,__m512h __A,__m512h __B,__m512h __C)3193 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3194 return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
3195 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3196 _MM_FROUND_CUR_DIRECTION);
3197 }
3198
3199 #define _mm512_fcmadd_round_pch(A, B, C, R) \
3200 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3201 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3202 (__mmask16)-1, (int)(R)))
3203
3204 #define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
3205 ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
3206 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3207 (__mmask16)(U), (int)(R)))
3208
3209 #define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
3210 ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
3211 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3212 (__mmask16)(U), (int)(R)))
3213
3214 #define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
3215 ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
3216 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3217 (__mmask16)(U), (int)(R)))
3218
_mm512_fmadd_pch(__m512h __A,__m512h __B,__m512h __C)3219 static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
3220 __m512h __B,
3221 __m512h __C) {
3222 return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
3223 (__v16sf)__C, (__mmask16)-1,
3224 _MM_FROUND_CUR_DIRECTION);
3225 }
3226
3227 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A,__mmask16 __U,__m512h __B,__m512h __C)3228 _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
3229 return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
3230 (__v16sf)__C, (__mmask16)__U,
3231 _MM_FROUND_CUR_DIRECTION);
3232 }
3233
3234 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A,__m512h __B,__m512h __C,__mmask16 __U)3235 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
3236 return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
3237 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3238 _MM_FROUND_CUR_DIRECTION);
3239 }
3240
3241 static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U,__m512h __A,__m512h __B,__m512h __C)3242 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
3243 return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
3244 (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
3245 _MM_FROUND_CUR_DIRECTION);
3246 }
3247
3248 #define _mm512_fmadd_round_pch(A, B, C, R) \
3249 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3250 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3251 (__mmask16)-1, (int)(R)))
3252
3253 #define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
3254 ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
3255 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3256 (__mmask16)(U), (int)(R)))
3257
3258 #define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
3259 ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
3260 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3261 (__mmask16)(U), (int)(R)))
3262
3263 #define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
3264 ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
3265 (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
3266 (__mmask16)(U), (int)(R)))
3267
// Horizontal sum of all 32 FP16 elements.  -0.0f16 is the identity of
// floating-point addition (0.0 + x == x even for x == -0.0), so it is the
// correct seed for the fadd reduction builtin.
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
}

// Horizontal product of all 32 FP16 elements, seeded with the
// multiplicative identity 1.0f16.
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W) {
  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
}

// Horizontal maximum of all 32 FP16 elements.
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmax_ph512(__V);
}

// Horizontal minimum of all 32 FP16 elements.
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmin_ph512(__V);
}
3287
// Per-element select: element i of the result is __W[i] when bit i of
// __U is set, else __A[i] (note __W and __A are swapped into the select
// builtin accordingly).
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
                                              (__v32hf)__A);
}

// Two-source permute of FP16 elements selected by the index vector __I;
// reuses the 16-bit-integer permute builtin via bit-preserving casts.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                 (__v32hi)__B);
}

// Single-source permute of the FP16 elements of __B by the indices in
// __A; note the builtin takes (data, indices) in that order.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}
3304
// The mul/cmul intrinsics below are pure aliases for the corresponding
// fmul/fcmul complex-multiply intrinsics defined above; Intel documents
// both spellings.
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3340
3341 #undef __DEFAULT_FN_ATTRS128
3342 #undef __DEFAULT_FN_ATTRS256
3343 #undef __DEFAULT_FN_ATTRS512
3344
3345 #endif
3346 #endif
3347