1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===------------------------------------------------------------------------===
8 */
9
10 #ifndef __IMMINTRIN_H
11 #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
12 #endif /* __IMMINTRIN_H */
13
14 #ifndef __AMXINTRIN_H
15 #define __AMXINTRIN_H
16 #ifdef __x86_64__
17
18 #define __DEFAULT_FN_ATTRS_TILE \
19 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
20
21 /// Load tile configuration from a 64-byte memory location specified by
22 /// "mem_addr". The tile configuration includes the tile type palette, the
23 /// number of bytes per row, and the number of rows. If the specified
24 /// palette_id is zero, that signifies the init state for both the tile
25 /// config and the tile data, and the tiles are zeroed. Any invalid
26 /// configurations will result in #GP fault.
27 ///
28 /// \headerfile <x86intrin.h>
29 ///
30 /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
31 ///
32 /// \param __config
33 /// A pointer to 512-bits configuration
34 static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void * __config)35 _tile_loadconfig(const void *__config) {
36 __builtin_ia32_tile_loadconfig(__config);
37 }
38
39 /// Stores the current tile configuration to a 64-byte memory location
40 /// specified by "mem_addr". The tile configuration includes the tile type
41 /// palette, the number of bytes per row, and the number of rows. If tiles
42 /// are not configured, all zeroes will be stored to memory.
43 ///
44 /// \headerfile <x86intrin.h>
45 ///
46 /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
47 ///
48 /// \param __config
49 /// A pointer to 512-bits configuration
50 static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void * __config)51 _tile_storeconfig(void *__config) {
52 __builtin_ia32_tile_storeconfig(__config);
53 }
54
55 /// Release the tile configuration to return to the init state, which
56 /// releases all storage it currently holds.
57 ///
58 /// \headerfile <x86intrin.h>
59 ///
60 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
_tile_release(void)61 static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
62 __builtin_ia32_tilerelease();
63 }
64
65 /// Load tile rows from memory specifieid by "base" address and "stride" into
66 /// destination tile "dst" using the tile configuration previously configured
67 /// via "_tile_loadconfig".
68 ///
69 /// \headerfile <x86intrin.h>
70 ///
71 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
72 ///
73 /// \param dst
74 /// A destination tile. Max size is 1024 Bytes.
75 /// \param base
76 /// A pointer to base address.
77 /// \param stride
78 /// The stride between the rows' data to be loaded in memory.
79 #define _tile_loadd(dst, base, stride) \
80 __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
81 (__SIZE_TYPE__)(stride))
82
83 /// Load tile rows from memory specifieid by "base" address and "stride" into
84 /// destination tile "dst" using the tile configuration previously configured
85 /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
86 /// that the data will likely not be reused in the near future and the data
87 /// caching can be optimized accordingly.
88 ///
89 /// \headerfile <x86intrin.h>
90 ///
91 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
92 ///
93 /// \param dst
94 /// A destination tile. Max size is 1024 Bytes.
95 /// \param base
96 /// A pointer to base address.
97 /// \param stride
98 /// The stride between the rows' data to be loaded in memory.
99 #define _tile_stream_loadd(dst, base, stride) \
100 __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
101 (__SIZE_TYPE__)(stride))
102
103 /// Store the tile specified by "src" to memory specifieid by "base" address and
104 /// "stride" using the tile configuration previously configured via
105 /// "_tile_loadconfig".
106 ///
107 /// \headerfile <x86intrin.h>
108 ///
109 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
110 ///
111 /// \param dst
112 /// A destination tile. Max size is 1024 Bytes.
113 /// \param base
114 /// A pointer to base address.
115 /// \param stride
116 /// The stride between the rows' data to be stored in memory.
117 #define _tile_stored(dst, base, stride) \
118 __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
119
120 /// Zero the tile specified by "tdest".
121 ///
122 /// \headerfile <x86intrin.h>
123 ///
124 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
125 ///
126 /// \param tile
127 /// The destination tile to be zero. Max size is 1024 Bytes.
128 #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
129
130 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
131 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
132 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
133 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
134 /// and store the 32-bit result back to tile "dst".
135 ///
136 /// \headerfile <x86intrin.h>
137 ///
138 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
139 ///
140 /// \param dst
141 /// The destination tile. Max size is 1024 Bytes.
142 /// \param src0
143 /// The 1st source tile. Max size is 1024 Bytes.
144 /// \param src1
145 /// The 2nd source tile. Max size is 1024 Bytes.
146 #define _tile_dpbssd(dst, src0, src1) \
147 __builtin_ia32_tdpbssd((dst), (src0), (src1))
148
149 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
150 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
151 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
152 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
153 /// in "dst", and store the 32-bit result back to tile "dst".
154 ///
155 /// \headerfile <x86intrin.h>
156 ///
157 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
158 ///
159 /// \param dst
160 /// The destination tile. Max size is 1024 Bytes.
161 /// \param src0
162 /// The 1st source tile. Max size is 1024 Bytes.
163 /// \param src1
164 /// The 2nd source tile. Max size is 1024 Bytes.
165 #define _tile_dpbsud(dst, src0, src1) \
166 __builtin_ia32_tdpbsud((dst), (src0), (src1))
167
168 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
169 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
170 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
171 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
172 /// and store the 32-bit result back to tile "dst".
173 ///
174 /// \headerfile <x86intrin.h>
175 ///
176 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
177 ///
178 /// \param dst
179 /// The destination tile. Max size is 1024 Bytes.
180 /// \param src0
181 /// The 1st source tile. Max size is 1024 Bytes.
182 /// \param src1
183 /// The 2nd source tile. Max size is 1024 Bytes.
184 #define _tile_dpbusd(dst, src0, src1) \
185 __builtin_ia32_tdpbusd((dst), (src0), (src1))
186
187 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
188 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
189 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
190 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
191 /// "dst", and store the 32-bit result back to tile "dst".
192 ///
193 /// \headerfile <x86intrin.h>
194 ///
195 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
196 ///
197 /// \param dst
198 /// The destination tile. Max size is 1024 Bytes.
199 /// \param src0
200 /// The 1st source tile. Max size is 1024 Bytes.
201 /// \param src1
202 /// The 2nd source tile. Max size is 1024 Bytes.
203 #define _tile_dpbuud(dst, src0, src1) \
204 __builtin_ia32_tdpbuud((dst), (src0), (src1))
205
206 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
207 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
208 /// elements with elements in "dst", and store the 32-bit result back to tile
209 /// "dst".
210 ///
211 /// \headerfile <x86intrin.h>
212 ///
213 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
214 ///
215 /// \param dst
216 /// The destination tile. Max size is 1024 Bytes.
217 /// \param src0
218 /// The 1st source tile. Max size is 1024 Bytes.
219 /// \param src1
220 /// The 2nd source tile. Max size is 1024 Bytes.
221 #define _tile_dpbf16ps(dst, src0, src1) \
222 __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
223
224 #define __DEFAULT_FN_ATTRS_INT8 \
225 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
226
227 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
228 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m,unsigned short n,const void * base,__SIZE_TYPE__ stride)229 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
230 __SIZE_TYPE__ stride) {
231 return __builtin_ia32_tileloadd64_internal(m, n, base,
232 (__SIZE_TYPE__)(stride));
233 }
234
235 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)236 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
237 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
238 return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
239 }
240
241 static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m,unsigned short n,void * base,__SIZE_TYPE__ stride,_tile1024i tile)242 _tile_stored_internal(unsigned short m, unsigned short n, void *base,
243 __SIZE_TYPE__ stride, _tile1024i tile) {
244 return __builtin_ia32_tilestored64_internal(m, n, base,
245 (__SIZE_TYPE__)(stride), tile);
246 }
247
248 typedef struct __tile1024i_str {
249 const unsigned short row;
250 const unsigned short col;
251 _tile1024i tile;
252 } __tile1024i;
253
254 __DEFAULT_FN_ATTRS_TILE
__tile_loadd(__tile1024i * dst,const void * base,__SIZE_TYPE__ stride)255 static void __tile_loadd(__tile1024i *dst, const void *base,
256 __SIZE_TYPE__ stride) {
257 dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
258 }
259
260 __DEFAULT_FN_ATTRS_INT8
__tile_dpbssd(__tile1024i * dst,__tile1024i src1,__tile1024i src2)261 static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1,
262 __tile1024i src2) {
263 dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
264 src1.tile, src2.tile);
265 }
266
267 __DEFAULT_FN_ATTRS_TILE
__tile_stored(void * base,__SIZE_TYPE__ stride,__tile1024i src)268 static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
269 _tile_stored_internal(src.row, src.col, base, stride, src.tile);
270 }
271
272 __DEFAULT_FN_ATTRS_TILE
__tile_zero(__tile1024i * dst)273 static void __tile_zero(__tile1024i *dst) {
274 dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
275 }
276
277 #endif /* __x86_64__ */
278 #endif /* __AMXINTRIN_H */
279