1 /*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9 #ifndef __IMMINTRIN_H
10 #error \
11 "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
12 #endif
13
14 #ifdef __SSE2__
15
16 #ifndef __AVX512VLFP16INTRIN_H
17 #define __AVX512VLFP16INTRIN_H
18
19 /* Define the default attributes for the functions in this file. */
/* 256-bit variants: always inlined, no debug info, require the
   avx512fp16+avx512vl target features (512-bit EVEX disabled). */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
                 __min_vector_width__(256)))
/* 128-bit variants: same target features, 128-bit minimum vector width. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
                 __min_vector_width__(128)))
28
/// Return the lowest (element 0) half-precision value of \a __a.
static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
  return __a[0];
}
32
/// Return the lowest (element 0) half-precision value of \a __a.
static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) {
  return __a[0];
}
36
/// Construct a 128-bit vector with \a __h in element 0 and the remaining
/// seven elements zeroed.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
  return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
}
40
/// Broadcast \a __h to all eight half-precision elements.
static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
  return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
}
44
/// Broadcast \a __h to all sixteen half-precision elements.
static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
  return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}
49
/// Construct a 128-bit vector from eight half-precision values; \a __h1
/// becomes the highest element and \a __h8 the lowest, hence the reversed
/// initializer order (initializers are listed low element first).
static __inline __m128h __DEFAULT_FN_ATTRS128
_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
           _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
  return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
}
55
56 static __inline __m256h __DEFAULT_FN_ATTRS256
_mm256_set1_pch(_Float16 _Complex h)57 _mm256_set1_pch(_Float16 _Complex h) {
58 return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h));
59 }
60
61 static __inline __m128h __DEFAULT_FN_ATTRS128
_mm_set1_pch(_Float16 _Complex h)62 _mm_set1_pch(_Float16 _Complex h) {
63 return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));
64 }
65
/// Construct a 256-bit vector from sixteen half-precision values; \a __h1
/// becomes the highest element and \a __h16 the lowest (initializers are
/// listed low element first, hence the reversed order).
static __inline __m256h __DEFAULT_FN_ATTRS256
_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
  return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
                            __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}
75
/* "setr" (set-reversed) variants take elements in memory (low-to-high)
   order; implemented by reversing the arguments into _mm_set_ph /
   _mm256_set_ph. */
#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8)                            \
  _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))

#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16)                                          \
  _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8),  \
                (h7), (h6), (h5), (h4), (h3), (h2), (h1))
83
/// Add packed half-precision elements of \a __A and \a __B.
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)((__v16hf)__A + (__v16hf)__B);
}

/// Masked add: lane i is (__A + __B)[i] if bit i of __U is set, else __W[i].
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
}

/// Zero-masked add: lane i is (__A + __B)[i] if bit i of __U is set, else 0.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}
100
/// Add packed half-precision elements of \a __A and \a __B.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)((__v8hf)__A + (__v8hf)__B);
}

/// Masked add: lane i is (__A + __B)[i] if bit i of __U is set, else __W[i].
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
                                              (__v8hf)__W);
}

/// Zero-masked add: lane i is (__A + __B)[i] if bit i of __U is set, else 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
120
/// Subtract packed half-precision elements of \a __B from \a __A.
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)((__v16hf)__A - (__v16hf)__B);
}

/// Masked subtract: lane i is (__A - __B)[i] if bit i of __U is set,
/// else __W[i].
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
}

/// Zero-masked subtract: lane i is (__A - __B)[i] if bit i of __U is set,
/// else 0.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}
137
/// Subtract packed half-precision elements of \a __B from \a __A.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)((__v8hf)__A - (__v8hf)__B);
}

/// Masked subtract: lane i is (__A - __B)[i] if bit i of __U is set,
/// else __W[i].
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
                                              (__v8hf)__W);
}

/// Zero-masked subtract: lane i is (__A - __B)[i] if bit i of __U is set,
/// else 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
157
/// Multiply packed half-precision elements of \a __A and \a __B.
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)((__v16hf)__A * (__v16hf)__B);
}

/// Masked multiply: lane i is (__A * __B)[i] if bit i of __U is set,
/// else __W[i].
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
}

/// Zero-masked multiply: lane i is (__A * __B)[i] if bit i of __U is set,
/// else 0.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}
174
/// Multiply packed half-precision elements of \a __A and \a __B.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)((__v8hf)__A * (__v8hf)__B);
}

/// Masked multiply: lane i is (__A * __B)[i] if bit i of __U is set,
/// else __W[i].
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
                                              (__v8hf)__W);
}

/// Zero-masked multiply: lane i is (__A * __B)[i] if bit i of __U is set,
/// else 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
194
/// Divide packed half-precision elements of \a __A by \a __B.
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)((__v16hf)__A / (__v16hf)__B);
}

/// Masked divide: lane i is (__A / __B)[i] if bit i of __U is set,
/// else __W[i].
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
}

/// Zero-masked divide: lane i is (__A / __B)[i] if bit i of __U is set,
/// else 0.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}
211
/// Divide packed half-precision elements of \a __A by \a __B.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)((__v8hf)__A / (__v8hf)__B);
}

/// Masked divide: lane i is (__A / __B)[i] if bit i of __U is set,
/// else __W[i].
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
                                              (__v8hf)__W);
}

/// Zero-masked divide: lane i is (__A / __B)[i] if bit i of __U is set,
/// else 0.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
231
/// Element-wise minimum of packed half-precision values (VMINPH).
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
}

/// Masked minimum: unselected lanes take the corresponding lane of __W.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)__W);
}

/// Zero-masked minimum: unselected lanes are zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)_mm256_setzero_ph());
}
252
/// Element-wise minimum of packed half-precision values (VMINPH).
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
}

/// Masked minimum: unselected lanes take the corresponding lane of __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)__W);
}

/// Zero-masked minimum: unselected lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)_mm_setzero_ph());
}
274
/// Element-wise maximum of packed half-precision values (VMAXPH).
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
}

/// Masked maximum: unselected lanes take the corresponding lane of __W.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)__W);
}

/// Zero-masked maximum: unselected lanes are zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)_mm256_setzero_ph());
}
295
/// Element-wise maximum of packed half-precision values (VMAXPH).
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
}

/// Masked maximum: unselected lanes take the corresponding lane of __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)__W);
}

/// Zero-masked maximum: unselected lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)_mm_setzero_ph());
}
317
/// Absolute value of packed half-precision elements: clears each 16-bit
/// element's sign bit by ANDing pairs of elements with 0x7FFF7FFF.
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) {
  return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A);
}

/// Absolute value of packed half-precision elements (128-bit form); clears
/// each element's sign bit.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
  return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
}
325
/// Complex conjugate of packed FP16 complex values: XOR with -0.0f flips
/// bit 31 of each 32-bit lane, i.e. the sign of the imaginary (high) half.
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
  return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
}

/// Masked conjugate; the mask selects 32-bit complex lanes, unselected
/// lanes take __W.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
}

/// Zero-masked conjugate; unselected complex lanes are zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
}

/// Complex conjugate (128-bit form): flips the sign bit of the imaginary
/// half of each 32-bit complex lane.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
  return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
}

/// Masked conjugate (128-bit); unselected complex lanes take __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A) {
  return (__m128h)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
}

/// Zero-masked conjugate (128-bit); unselected complex lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
}
358
/* Compare packed half-precision values with predicate p (a _CMP_* constant);
   each result bit of the mask is the outcome for one element.  The masked
   forms AND the result with mask m. */
#define _mm256_cmp_ph_mask(a, b, p)                                            \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))

#define _mm256_mask_cmp_ph_mask(m, a, b, p)                                    \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m)))

#define _mm_cmp_ph_mask(a, b, p)                                               \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1))

#define _mm_mask_cmp_ph_mask(m, a, b, p)                                       \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
374
/// Hardware approximate reciprocal of packed half-precision values (VRCPPH).
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
}

/// Masked approximate reciprocal; unselected lanes take __W.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
                                               (__mmask16)__U);
}

/// Zero-masked approximate reciprocal; unselected lanes are zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}

/// Hardware approximate reciprocal (128-bit form).
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

/// Masked approximate reciprocal (128-bit); unselected lanes take __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
                                               (__mmask8)__U);
}

/// Zero-masked approximate reciprocal (128-bit); unselected lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U,
                                                                 __m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
409
/// Hardware approximate reciprocal square root of packed half-precision
/// values (VRSQRTPH).
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
}

/// Masked approximate rsqrt; unselected lanes take __W.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
                                                 (__mmask16)__U);
}

/// Zero-masked approximate rsqrt; unselected lanes are zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}

/// Hardware approximate reciprocal square root (128-bit form).
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask(
      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

/// Masked approximate rsqrt (128-bit); unselected lanes take __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
                                                 (__mmask8)__U);
}

/// Zero-masked approximate rsqrt (128-bit); unselected lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
444
/// Extract the biased exponent of each half-precision element as a
/// half-precision value (VGETEXPPH).
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

/// Masked getexp; unselected lanes take __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
                                                  (__mmask8)__U);
}

/// Zero-masked getexp; unselected lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

/// Extract the exponent of each half-precision element (256-bit form).
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
}

/// Masked getexp (256-bit); unselected lanes take __W.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
                                                  (__mmask16)__U);
}

/// Zero-masked getexp (256-bit); unselected lanes are zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}
478
/* Extract the normalized mantissa of each half-precision element
   (VGETMANTPH).  B selects the normalization interval and C the sign
   control; ((C) << 2) | (B) packs both into the 4-bit immediate. */
#define _mm_getmant_ph(A, B, C)                                                \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1))

#define _mm_mask_getmant_ph(W, U, A, B, C)                                     \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W),     \
      (__mmask8)(U)))

#define _mm_maskz_getmant_ph(U, A, B, C)                                       \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U)))

#define _mm256_getmant_ph(A, B, C)                                             \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))

#define _mm256_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W),   \
      (__mmask16)(U)))

#define _mm256_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
508
/// Scale each half-precision element of \a __A by a power of two taken
/// from the corresponding element of \a __B (VSCALEFPH).
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

/// Masked scalef; unselected lanes take __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U);
}

/// Zero-masked scalef; unselected lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

/// Scale packed half-precision elements of \a __A by powers of two from
/// \a __B (256-bit form).
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A,
                                                                 __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask(
      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
}

/// Masked scalef (256-bit); unselected lanes take __W.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
                                                  (__v16hf)__W, (__mmask16)__U);
}

/// Zero-masked scalef (256-bit); unselected lanes are zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask(
      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}
544
/* Round packed half-precision values to a given number of fraction bits
   (VRNDSCALEPH); imm encodes rounding mode and scale. */
#define _mm_roundscale_ph(A, imm)                                              \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)-1))

#define _mm_mask_roundscale_ph(W, U, A, imm)                                   \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_roundscale_ph(U, A, imm)                                     \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)(U)))

#define _mm256_roundscale_ph(A, imm)                                           \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)-1))

#define _mm256_mask_roundscale_ph(W, U, A, imm)                                \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
      (__mmask16)(U)))

#define _mm256_maskz_roundscale_ph(U, A, imm)                                  \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)(U)))
573
/* Per-element reduction transformation of packed half-precision values
   (VREDUCEPH); imm selects the number of fraction bits and rounding. */
#define _mm_reduce_ph(A, imm)                                                  \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)-1))

#define _mm_mask_reduce_ph(W, U, A, imm)                                       \
  ((__m128h)__builtin_ia32_reduceph128_mask(                                   \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_reduce_ph(U, A, imm)                                         \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)(U)))

#define _mm256_reduce_ph(A, imm)                                               \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)-1))

#define _mm256_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)(__m256h)(W),             \
                                            (__mmask16)(U)))

#define _mm256_maskz_reduce_ph(U, A, imm)                                      \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)(U)))
602
/// Square root of packed half-precision values (VSQRTPH).
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
  return __builtin_ia32_sqrtph((__v8hf)__a);
}

/// Masked square root; unselected lanes take __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
}

/// Zero-masked square root; unselected lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
                                                                  __m128h __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
}

/// Square root of packed half-precision values (256-bit form).
static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
}

/// Masked square root (256-bit); unselected lanes take __W.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
}

/// Zero-masked square root (256-bit); unselected lanes are zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_sqrt_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
636
/* Classify packed half-precision values (VFPCLASSPH); imm selects the
   categories to test (NaN, infinity, denormal, zero, negative) and the
   result mask has a bit set for each matching element. */
#define _mm_mask_fpclass_ph_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)(U)))

#define _mm_fpclass_ph_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)-1))

#define _mm256_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)(U)))

#define _mm256_fpclass_ph_mask(A, imm)                                         \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)-1))
652
/// Convert packed double-precision values to half-precision (VCVTPD2PH);
/// the two results occupy the low elements of the 128-bit destination.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
      (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

/// Masked double-to-half conversion; unselected lanes take __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

/// Zero-masked double-to-half conversion; unselected lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
      (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

/// Convert four packed doubles to half-precision; results land in the low
/// elements of a 128-bit vector.
static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
      (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

/// Masked 256-bit double-to-half conversion; unselected lanes take __W.
static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

/// Zero-masked 256-bit double-to-half conversion; unselected lanes are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
      (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
687
/// Convert the low half-precision elements of \a __A to packed
/// double-precision (VCVTPH2PD).
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
      (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
}

/// Masked half-to-double conversion; unselected lanes take __W.
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
                                                   (__mmask8)__U);
}

/// Zero-masked half-to-double conversion; unselected lanes are zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
      (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
}

/// Convert low half-precision elements of \a __A to four packed doubles.
static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
      (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
}

/// Masked 256-bit half-to-double conversion; unselected lanes take __W.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
                                                   (__mmask8)__U);
}

/// Zero-masked 256-bit half-to-double conversion; unselected lanes are zero.
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
      (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
}
722
/// Convert packed half-precision values to signed 16-bit integers using
/// the current rounding mode (VCVTPH2W).
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
}

/// Masked half-to-int16 conversion; unselected lanes take __W.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
                                                  (__mmask8)__U);
}

/// Zero-masked half-to-int16 conversion; unselected lanes are zero.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
}

/// Convert sixteen packed half-precision values to signed 16-bit integers.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
}

/// Masked 256-bit half-to-int16 conversion; unselected lanes take __W.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
                                                  (__mmask16)__U);
}

/// Zero-masked 256-bit half-to-int16 conversion; unselected lanes are zero.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
}
757
/* VCVTTPH2W: truncating (round-toward-zero, per the "tt" naming) conversion
   of packed FP16 elements to packed signed 16-bit integers, with the usual
   unmasked / merge-mask / zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
}
792
/* Signed 16-bit integer -> FP16 conversion.  The core conversion is an
   element-wise __builtin_convertvector; masking is layered on top with the
   selectph builtins (merge from __W, or zero). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
  return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_ph(__m256i __A) {
  return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_cvtepi16_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
826
/* VCVTPH2UW: convert packed FP16 elements to packed unsigned 16-bit
   integers; unmasked / merge-mask / zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}
861
/* VCVTTPH2UW: truncating (round-toward-zero) conversion of packed FP16
   elements to packed unsigned 16-bit integers; unmasked / merge-mask /
   zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                    (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}
896
/* Unsigned 16-bit integer -> FP16 conversion via __builtin_convertvector;
   masking layered with the selectph builtins (merge from __W, or zero). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
  return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_ph(__m256i __A) {
  return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_cvtepu16_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
930
/* VCVTPH2DQ: convert FP16 elements to signed 32-bit integers.  Only the
   low 4 (128-bit) or 8 (256-bit result) FP16 lanes of __A participate;
   unmasked / merge-mask / zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}
965
/* VCVTPH2UDQ: convert FP16 elements to unsigned 32-bit integers (low 4 or
   8 lanes of __A, depending on result width); unmasked / merge-mask /
   zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}
1000
/* Signed 32-bit integer -> FP16 conversion (VCVTDQ2PH).  The 128-bit form
   uses the masked builtin directly; the 256-bit form converts 8 x i32 to a
   narrowed 8 x FP16 (__m128h) result via __builtin_convertvector, with
   masking layered through selectph_128. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
}
1034
/* Unsigned 32-bit integer -> FP16 conversion (VCVTUDQ2PH).  Mirrors the
   signed cvtepi32_ph family: masked builtin for 128-bit sources, and a
   __builtin_convertvector + selectph_128 combination for 256-bit sources
   (8 x u32 narrows to an 8 x FP16 __m128h result). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
}
1068
/* VCVTTPH2DQ: truncating (round-toward-zero) conversion of FP16 elements
   to signed 32-bit integers; unmasked / merge-mask / zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}
1103
/* VCVTTPH2UDQ: truncating (round-toward-zero) conversion of FP16 elements
   to unsigned 32-bit integers; unmasked / merge-mask / zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}
1138
/* VCVTQQ2PH: signed 64-bit integer -> FP16 conversion.  Both source widths
   narrow into an 8 x FP16 __m128h result (2 or 4 active lanes); the masked
   builtins implement merge (__W) and zero (_mm_setzero_ph) behavior. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
1173
/* VCVTPH2QQ: convert the low 2 (128-bit) or 4 (256-bit result) FP16
   elements of __A to signed 64-bit integers; unmasked / merge-mask /
   zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
1208
/* VCVTUQQ2PH: unsigned 64-bit integer -> FP16 conversion; result is always
   an 8 x FP16 __m128h (2 or 4 active lanes).  Unmasked / merge-mask /
   zero-mask variants. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
1243
/* VCVTPH2UQQ: convert the low 2 (128-bit) or 4 (256-bit result) FP16
   elements of __A to unsigned 64-bit integers; unmasked / merge-mask /
   zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}
1278
/* VCVTTPH2QQ: truncating (round-toward-zero) conversion of the low 2 or 4
   FP16 elements of __A to signed 64-bit integers; unmasked / merge-mask /
   zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
1313
/* VCVTTPH2UQQ: truncating (round-toward-zero) conversion of the low 2 or 4
   FP16 elements of __A to unsigned 64-bit integers; unmasked / merge-mask /
   zero-mask variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}
1348
/* VCVTPH2PSX: widen the low 4 (128-bit) or 8 (256-bit result) FP16
   elements of __A to single precision; unmasked / merge-mask / zero-mask
   variants. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
}
1383
_mm_cvtxps_ph(__m128 __A)1384 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
1385 return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
1386 (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1387 }
1388
_mm_mask_cvtxps_ph(__m128h __W,__mmask8 __U,__m128 __A)1389 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W,
1390 __mmask8 __U,
1391 __m128 __A) {
1392 return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
1393 (__mmask8)__U);
1394 }
1395
1396 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxps_ph(__mmask8 __U,__m128 __A)1397 _mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
1398 return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
1399 (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1400 }
1401
/* Convert 8 FP32 elements of __A to a 128-bit vector of 8 FP16 elements (mask all-ones). */
static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}
1406
/* Convert 8 FP32 elements of __A to FP16; lanes with a clear mask bit keep the value from __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}
1412
/* Convert 8 FP32 elements of __A to FP16; zero result lanes whose mask bit is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
1418
/* Fused multiply-add: (__A * __B) + __C per FP16 lane, with a single rounding. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
                                                             __m128h __B,
                                                             __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          (__v8hf)__C);
}
1425
/* Masked FMA: (__A * __B) + __C where the mask bit is set; otherwise keep __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
                                                                  __mmask8 __U,
                                                                  __m128h __B,
                                                                  __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}
1435
/* Masked FMA (mask3 form): (__A * __B) + __C where the mask bit is set; otherwise keep __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}
1443
/* Zero-masked FMA: (__A * __B) + __C where the mask bit is set; otherwise zero. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
1451
/* Fused multiply-subtract: (__A * __B) - __C, expressed as FMA with a negated addend. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
                                                             __m128h __B,
                                                             __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          -(__v8hf)__C);
}
1458
_mm_mask_fmsub_ph(__m128h __A,__mmask8 __U,__m128h __B,__m128h __C)1459 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
1460 __mmask8 __U,
1461 __m128h __B,
1462 __m128h __C) {
1463 return (__m128h)__builtin_ia32_selectph_128(
1464 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1465 (__v8hf)__A);
1466 }
1467
1468 static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_ph(__mmask8 __U,__m128h __A,__m128h __B,__m128h __C)1469 _mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1470 return (__m128h)__builtin_ia32_selectph_128(
1471 (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1472 (__v8hf)_mm_setzero_ph());
1473 }
1474
/* Masked FNMADD (mask3 form): -(__A * __B) + __C where the mask bit is set; otherwise keep __C.
 * Negating __A negates the product. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}
1482
/* Zero-masked FNMADD: -(__A * __B) + __C where the mask bit is set; otherwise zero. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
1490
/* Zero-masked FNMSUB: -(__A * __B) - __C where the mask bit is set; otherwise zero. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
1498
/* Fused multiply-add: (__A * __B) + __C per FP16 lane (16 lanes), single rounding. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
                                                                __m256h __B,
                                                                __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             (__v16hf)__C);
}
1505
/* Masked FMA: (__A * __B) + __C where the mask bit is set; otherwise keep __A. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}
1513
/* Masked FMA (mask3 form): (__A * __B) + __C where the mask bit is set; otherwise keep __C. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}
1521
/* Zero-masked FMA: (__A * __B) + __C where the mask bit is set; otherwise zero. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
1529
/* Fused multiply-subtract: (__A * __B) - __C, expressed as FMA with a negated addend. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
                                                                __m256h __B,
                                                                __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             -(__v16hf)__C);
}
1536
/* Masked FMSUB: (__A * __B) - __C where the mask bit is set; otherwise keep __A. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}
1544
/* Zero-masked FMSUB: (__A * __B) - __C where the mask bit is set; otherwise zero. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
1552
/* Masked FNMADD (mask3 form): -(__A * __B) + __C where the mask bit is set; otherwise keep __C. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}
1560
/* Zero-masked FNMADD: -(__A * __B) + __C where the mask bit is set; otherwise zero. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
1568
/* Zero-masked FNMSUB: -(__A * __B) - __C where the mask bit is set; otherwise zero. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
1576
/* FMADDSUB: (__A * __B) - __C in even lanes, (__A * __B) + __C in odd lanes. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
                                                                __m128h __B,
                                                                __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
                                             (__v8hf)__C);
}
1583
/* Masked FMADDSUB; lanes with a clear mask bit keep __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}
1591
/* Masked FMADDSUB (mask3 form); lanes with a clear mask bit keep __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}
1599
/* Zero-masked FMADDSUB; lanes with a clear mask bit are zeroed. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
1607
/* FMSUBADD: (__A * __B) + __C in even lanes, (__A * __B) - __C in odd lanes
 * (FMADDSUB with the addend negated). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A,
                                                                __m128h __B,
                                                                __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
                                             -(__v8hf)__C);
}
1614
/* Masked FMSUBADD; lanes with a clear mask bit keep __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__A);
}
1622
/* Zero-masked FMSUBADD; lanes with a clear mask bit are zeroed. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
1630
/* FMADDSUB: (__A * __B) - __C in even lanes, (__A * __B) + __C in odd lanes. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
                                                (__v16hf)__C);
}
1636
/* Masked FMADDSUB; lanes with a clear mask bit keep __A. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}
1644
/* Masked FMADDSUB (mask3 form); lanes with a clear mask bit keep __C. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}
1652
/* Zero-masked FMADDSUB; lanes with a clear mask bit are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
1660
/* FMSUBADD: (__A * __B) + __C in even lanes, (__A * __B) - __C in odd lanes. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
                                                -(__v16hf)__C);
}
1666
/* Masked FMSUBADD; lanes with a clear mask bit keep __A. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}
1674
/* Zero-masked FMSUBADD; lanes with a clear mask bit are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
1682
/* Masked FMSUB (mask3 form): (__A * __B) - __C where the mask bit is set; otherwise keep __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}
1690
/* Masked FMSUB (mask3 form): (__A * __B) - __C where the mask bit is set; otherwise keep __C. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}
1698
/* Masked FMSUBADD (mask3 form); lanes with a clear mask bit keep __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}
1706
/* Masked FMSUBADD (mask3 form); lanes with a clear mask bit keep __C. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}
1714
/* FNMADD: -(__A * __B) + __C, expressed as FMA with a negated multiplicand. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
                                          (__v8hf)__C);
}
1721
/* Masked FNMADD: -(__A * __B) + __C where the mask bit is set; otherwise keep __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}
1729
/* FNMADD: -(__A * __B) + __C, expressed as FMA with a negated multiplicand. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
                                                                 __m256h __B,
                                                                 __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
                                             (__v16hf)__C);
}
1736
/* Masked FNMADD: -(__A * __B) + __C where the mask bit is set; otherwise keep __A. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}
1744
/* FNMSUB: -(__A * __B) - __C, expressed as FMA with negated multiplicand and addend. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
                                          -(__v8hf)__C);
}
1751
/* Masked FNMSUB: -(__A * __B) - __C where the mask bit is set; otherwise keep __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__A);
}
1759
/* Masked FNMSUB (mask3 form): -(__A * __B) - __C where the mask bit is set; otherwise keep __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}
1767
/* FNMSUB: -(__A * __B) - __C, expressed as FMA with negated multiplicand and addend. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
                                                                 __m256h __B,
                                                                 __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
                                             -(__v16hf)__C);
}
1774
/* Masked FNMSUB: -(__A * __B) - __C where the mask bit is set; otherwise keep __A. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}
1782
/* Masked FNMSUB (mask3 form): -(__A * __B) - __C where the mask bit is set; otherwise keep __C. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}
1790
/* Complex multiply: each FP16 complex pair of __A times the conjugate of the
 * corresponding pair of __B.  Pairs are handled as 32-bit lanes (__v4sf casts). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
}
1796
/* Masked conjugate complex multiply; each mask bit covers one complex pair,
 * clear bits keep the pair from __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__W, (__mmask8)__U);
}
1802
/* Zero-masked conjugate complex multiply; clear mask bits zero the complex pair. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
}
1808
_mm256_fcmul_pch(__m256h __A,__m256h __B)1809 static __inline__ __m256h __DEFAULT_FN_ATTRS128 _mm256_fcmul_pch(__m256h __A,
1810 __m256h __B) {
1811 return (__m256h)__builtin_ia32_vfcmulcph256_mask(
1812 (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
1813 }
1814
/* Masked conjugate complex multiply; clear mask bits keep the pair from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__W, (__mmask8)__U);
}
1820
/* Zero-masked conjugate complex multiply; clear mask bits zero the complex pair. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
}
1826
/* Conjugate complex FMA: __A * conj(__B) + __C per FP16 complex pair (mask all-ones). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A,
                                                               __m128h __B,
                                                               __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)-1);
}
1833
/* Masked conjugate complex FMA; the outer 32-bit select merges pairs from __A
 * where the mask bit is clear (intrinsic result defaults to the first operand). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,
                                        (__v4sf)__C, (__mmask8)__U),
      (__v4sf)__A);
}
1842
/* Masked conjugate complex FMA (mask3 form); the builtin's merge masking
 * appears to keep __C (the accumulator) in masked-off pairs — hence no outer select. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)__U);
}
1848
/* Zero-masked conjugate complex FMA; clear mask bits zero the complex pair. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
}
1854
/* Conjugate complex FMA: __A * conj(__B) + __C per FP16 complex pair (mask all-ones). */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A,
                                                                  __m256h __B,
                                                                  __m256h __C) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)-1);
}
1861
/* Masked conjugate complex FMA; the outer 32-bit select merges pairs from __A
 * where the mask bit is clear. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
                                        (__mmask8)__U),
      (__v8sf)__A);
}
1870
/* Masked conjugate complex FMA (mask3 form); the builtin's merge masking
 * appears to keep __C (the accumulator) in masked-off pairs — hence no outer select. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)__U);
}
1876
/* Zero-masked conjugate complex FMA; clear mask bits zero the complex pair. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
}
1882
/* Complex multiply (no conjugation) of FP16 complex pairs in __A and __B. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
}
1888
/* Masked complex multiply; clear mask bits keep the complex pair from __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                  (__v4sf)__W, (__mmask8)__U);
}
1896
/* Zero-masked complex multiply; clear mask bits zero the complex pair. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
}
1902
/* Complex multiply (no conjugation) of FP16 complex pairs in __A and __B. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A,
                                                                __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
}
1908
/* Masked complex multiply; clear mask bits keep the complex pair from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                  (__v8sf)__W, (__mmask8)__U);
}
1914
/* Zero-masked complex multiply; clear mask bits zero the complex pair. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
}
1920
/* Complex FMA: __A * __B + __C per FP16 complex pair (mask all-ones). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__C, (__mmask8)-1);
}
1927
/* Masked complex FMA; the outer 32-bit select merges pairs from __A
 * where the mask bit is clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
                                       (__mmask8)__U),
      (__v4sf)__A);
}
1936
/* Masked complex FMA (mask3 form); the builtin's merge masking appears to keep
 * __C (the accumulator) in masked-off pairs — hence no outer select. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__C, (__mmask8)__U);
}
1942
/* Zero-masked complex FMA; clear mask bits zero the complex pair. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)__U);
}
1948
/* Complex FMA: __A * __B + __C per FP16 complex pair (mask all-ones). */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A,
                                                                 __m256h __B,
                                                                 __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__C, (__mmask8)-1);
}
1955
/* Masked complex FMA; the outer 32-bit select merges pairs from __A
 * where the mask bit is clear. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
                                       (__mmask8)__U),
      (__v8sf)__A);
}
1964
/* Masked complex FMA (mask3 form); the builtin's merge masking appears to keep
 * __C (the accumulator) in masked-off pairs — hence no outer select. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__C, (__mmask8)__U);
}
1970
/* Zero-masked complex FMA; clear mask bits zero the complex pair. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)__U);
}
1976
/* Per-lane blend: take the FP16 lane from __W where the mask bit is set, else from __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __W) {
  return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
                                              (__v8hf)__A);
}
1983
/* Per-lane blend: take the FP16 lane from __W where the mask bit is set, else from __A. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
                                              (__v16hf)__A);
}
1989
/* Two-source permute: select 16-bit elements from the concatenation of __A and __B
 * according to the indices in __I (reuses the integer vpermi2w builtin). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
  return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                 (__v8hi)__B);
}
1995
/* Two-source permute: select 16-bit elements from the concatenation of __A and __B
 * according to the indices in __I. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
  return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                 (__v16hi)__B);
}
2001
/* Shuffle 16-bit elements of __B using the indices in __A (note swapped builtin
 * argument order: data first, indices second). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_permutexvar_ph(__m128i __A, __m128h __B) {
  return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
}
2006
/* Shuffle 16-bit elements of __B using the indices in __A (data first, indices second). */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
  return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
}
2011
/* Horizontal sum of all 16 FP16 lanes; -0.0 is the identity element for fadd. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_add_ph(__m256h __W) {
  return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
}
2016
/* Horizontal product of all 16 FP16 lanes; 1.0 is the identity element for fmul. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_ph(__m256h __W) {
  return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
}
2021
/* Horizontal maximum of all 16 FP16 lanes. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_max_ph(__m256h __V) {
  return __builtin_ia32_reduce_fmax_ph256(__V);
}
2026
/* Horizontal minimum of all 16 FP16 lanes. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_min_ph(__m256h __V) {
  return __builtin_ia32_reduce_fmin_ph256(__V);
}
2031
/* Horizontal sum of all 8 FP16 lanes; -0.0 is the identity element for fadd. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_add_ph(__m128h __W) {
  return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
}
2036
/* Horizontal product of all 8 FP16 lanes; 1.0 is the identity element for fmul. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_mul_ph(__m128h __W) {
  return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
}
2041
/* Horizontal maximum of all 8 FP16 lanes. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_max_ph(__m128h __V) {
  return __builtin_ia32_reduce_fmax_ph128(__V);
}
2046
/* Horizontal minimum of all 8 FP16 lanes. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_min_ph(__m128h __V) {
  return __builtin_ia32_reduce_fmin_ph128(__V);
}
2051
// The intrinsics below are aliases for the corresponding f[c]mul_pch
// intrinsics: mul_pch == fmul_pch (complex multiply) and
// cmul_pch == fcmul_pch (complex multiply by conjugate).
#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B)
#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B)
#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B)
#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B)
#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B)
#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B)

#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B)
#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B)
#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B)
#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B)
#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B)
#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B)
2066
2067 #undef __DEFAULT_FN_ATTRS128
2068 #undef __DEFAULT_FN_ATTRS256
2069
2070 #endif
2071 #endif
2072