1 /*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
11 #endif
12 
13 #ifdef __SSE2__
14 
15 #ifndef __AVX512BF16INTRIN_H
16 #define __AVX512BF16INTRIN_H
17 
18 typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
19 typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
20 typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
21 
/* Attributes for the 512-bit intrinsics in this header: always inlined, no
 * debug info, built for avx512bf16 with evex512 enabled, and a minimum
 * vector width of 512. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512bf16,evex512"),                             \
                 __min_vector_width__(512)))
/* Attributes for the scalar conversion _mm_cvtsbh_ss: always inlined, no
 * debug info, built for avx512bf16 with evex512 explicitly disabled. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512bf16,no-evex512")))
28 
29 /// Convert One BF16 Data to One Single Float Data.
30 ///
31 /// \headerfile <x86intrin.h>
32 ///
33 /// This intrinsic does not correspond to a specific instruction.
34 ///
35 /// \param __A
36 ///    A bfloat data.
37 /// \returns A float data whose sign field and exponent field keep unchanged,
38 ///    and fraction field is extended to 23 bits.
39 static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
40   return __builtin_ia32_cvtsbf162ss_32(__A);
41 }
42 
43 /// Convert Two Packed Single Data to One Packed BF16 Data.
44 ///
45 /// \headerfile <x86intrin.h>
46 ///
47 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48 ///
49 /// \param __A
50 ///    A 512-bit vector of [16 x float].
51 /// \param __B
52 ///    A 512-bit vector of [16 x float].
53 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
54 ///    conversion of __B, and higher 256 bits come from conversion of __A.
55 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
56 _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
57   return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
58                                                     (__v16sf) __B);
59 }
60 
61 /// Convert Two Packed Single Data to One Packed BF16 Data.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
66 ///
67 /// \param __A
68 ///    A 512-bit vector of [16 x float].
69 /// \param __B
70 ///    A 512-bit vector of [16 x float].
71 /// \param __W
72 ///    A 512-bit vector of [32 x bfloat].
73 /// \param __U
74 ///    A 32-bit mask value specifying what is chosen for each element.
75 ///    A 1 means conversion of __A or __B. A 0 means element from __W.
76 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
77 ///    conversion of __B, and higher 256 bits come from conversion of __A.
78 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
79 _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
80   return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
81                                         (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
82                                         (__v32bf)__W);
83 }
84 
85 /// Convert Two Packed Single Data to One Packed BF16 Data.
86 ///
87 /// \headerfile <x86intrin.h>
88 ///
89 /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
90 ///
91 /// \param __A
92 ///    A 512-bit vector of [16 x float].
93 /// \param __B
94 ///    A 512-bit vector of [16 x float].
95 /// \param __U
96 ///    A 32-bit mask value specifying what is chosen for each element.
97 ///    A 1 means conversion of __A or __B. A 0 means element is zero.
98 /// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
99 ///    conversion of __B, and higher 256 bits come from conversion of __A.
100 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
101 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
102   return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
103                                         (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
104                                         (__v32bf)_mm512_setzero_si512());
105 }
106 
107 /// Convert Packed Single Data to Packed BF16 Data.
108 ///
109 /// \headerfile <x86intrin.h>
110 ///
111 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
112 ///
113 /// \param __A
114 ///    A 512-bit vector of [16 x float].
115 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
116 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
117 _mm512_cvtneps_pbh(__m512 __A) {
118   return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
119                                               (__v16bf)_mm256_undefined_si256(),
120                                               (__mmask16)-1);
121 }
122 
123 /// Convert Packed Single Data to Packed BF16 Data.
124 ///
125 /// \headerfile <x86intrin.h>
126 ///
127 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
128 ///
129 /// \param __A
130 ///    A 512-bit vector of [16 x float].
131 /// \param __W
132 ///    A 256-bit vector of [16 x bfloat].
133 /// \param __U
134 ///    A 16-bit mask value specifying what is chosen for each element.
135 ///    A 1 means conversion of __A. A 0 means element from __W.
136 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
137 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
138 _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
139   return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
140                                                         (__v16bf)__W,
141                                                         (__mmask16)__U);
142 }
143 
144 /// Convert Packed Single Data to Packed BF16 Data.
145 ///
146 /// \headerfile <x86intrin.h>
147 ///
148 /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
149 ///
150 /// \param __A
151 ///    A 512-bit vector of [16 x float].
152 /// \param __U
153 ///    A 16-bit mask value specifying what is chosen for each element.
154 ///    A 1 means conversion of __A. A 0 means element is zero.
155 /// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
156 static __inline__ __m256bh __DEFAULT_FN_ATTRS512
157 _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
158   return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
159                                                 (__v16bf)_mm256_setzero_si256(),
160                                                 (__mmask16)__U);
161 }
162 
163 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
164 ///
165 /// \headerfile <x86intrin.h>
166 ///
167 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
168 ///
169 /// \param __A
170 ///    A 512-bit vector of [32 x bfloat].
171 /// \param __B
172 ///    A 512-bit vector of [32 x bfloat].
173 /// \param __D
174 ///    A 512-bit vector of [16 x float].
175 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
176 ///  __A, __B and __D
177 static __inline__ __m512 __DEFAULT_FN_ATTRS512
178 _mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
179   return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
180                                              (__v32bf) __A,
181                                              (__v32bf) __B);
182 }
183 
184 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
185 ///
186 /// \headerfile <x86intrin.h>
187 ///
188 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
189 ///
190 /// \param __A
191 ///    A 512-bit vector of [32 x bfloat].
192 /// \param __B
193 ///    A 512-bit vector of [32 x bfloat].
194 /// \param __D
195 ///    A 512-bit vector of [16 x float].
196 /// \param __U
197 ///    A 16-bit mask value specifying what is chosen for each element.
198 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
199 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
200 ///  __A, __B and __D
201 static __inline__ __m512 __DEFAULT_FN_ATTRS512
202 _mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
203   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
204                                        (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
205                                        (__v16sf)__D);
206 }
207 
208 /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
209 ///
210 /// \headerfile <x86intrin.h>
211 ///
212 /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
213 ///
214 /// \param __A
215 ///    A 512-bit vector of [32 x bfloat].
216 /// \param __B
217 ///    A 512-bit vector of [32 x bfloat].
218 /// \param __D
219 ///    A 512-bit vector of [16 x float].
220 /// \param __U
221 ///    A 16-bit mask value specifying what is chosen for each element.
222 ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
223 /// \returns A 512-bit vector of [16 x float] comes from  Dot Product of
224 ///  __A, __B and __D
225 static __inline__ __m512 __DEFAULT_FN_ATTRS512
226 _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
227   return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
228                                        (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
229                                        (__v16sf)_mm512_setzero_si512());
230 }
231 
232 /// Convert Packed BF16 Data to Packed float Data.
233 ///
234 /// \headerfile <x86intrin.h>
235 ///
236 /// \param __A
237 ///    A 256-bit vector of [16 x bfloat].
238 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
239 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
240   return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
241       (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
242 }
243 
244 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
245 ///
246 /// \headerfile <x86intrin.h>
247 ///
248 /// \param __U
249 ///    A 16-bit mask. Elements are zeroed out when the corresponding mask
250 ///    bit is not set.
251 /// \param __A
252 ///    A 256-bit vector of [16 x bfloat].
253 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
254 static __inline__ __m512 __DEFAULT_FN_ATTRS512
255 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
256   return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
257       (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
258 }
259 
260 /// Convert Packed BF16 Data to Packed float Data using merging mask.
261 ///
262 /// \headerfile <x86intrin.h>
263 ///
264 /// \param __S
265 ///    A 512-bit vector of [16 x float]. Elements are copied from __S when
266 ///     the corresponding mask bit is not set.
267 /// \param __U
268 ///    A 16-bit mask.
269 /// \param __A
270 ///    A 256-bit vector of [16 x bfloat].
271 /// \returns A 512-bit vector of [16 x float] come from conversion of __A
272 static __inline__ __m512 __DEFAULT_FN_ATTRS512
273 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
274   return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
275       (__m512i)__S, (__mmask16)__U,
276       (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
277 }
278 
/* Remove the attribute helper macros defined earlier in this header. */
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS
281 
282 #endif
283 #endif
284