1 /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===------------------------------------------------------------------------===
8  */
9 
10 #ifndef __IMMINTRIN_H
11 #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
12 #endif /* __IMMINTRIN_H */
13 
14 #ifndef __AMXINTRIN_H
15 #define __AMXINTRIN_H
16 #ifdef __x86_64__
17 
18 #define __DEFAULT_FN_ATTRS_TILE                                                \
19   __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
20 
21 /// Load tile configuration from a 64-byte memory location specified by
22 /// "mem_addr". The tile configuration includes the tile type palette, the
23 /// number of bytes per row, and the number of rows. If the specified
24 /// palette_id is zero, that signifies the init state for both the tile
25 /// config and the tile data, and the tiles are zeroed. Any invalid
26 /// configurations will result in #GP fault.
27 ///
28 /// \headerfile <x86intrin.h>
29 ///
30 /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
31 ///
32 /// \param __config
33 ///    A pointer to 512-bits configuration
34 static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_loadconfig(const void * __config)35 _tile_loadconfig(const void *__config) {
36   __builtin_ia32_tile_loadconfig(__config);
37 }
38 
39 /// Stores the current tile configuration to a 64-byte memory location
40 /// specified by "mem_addr". The tile configuration includes the tile type
41 /// palette, the number of bytes per row, and the number of rows. If tiles
42 /// are not configured, all zeroes will be stored to memory.
43 ///
44 /// \headerfile <x86intrin.h>
45 ///
46 /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
47 ///
48 /// \param __config
49 ///    A pointer to 512-bits configuration
50 static __inline__ void __DEFAULT_FN_ATTRS_TILE
_tile_storeconfig(void * __config)51 _tile_storeconfig(void *__config) {
52   __builtin_ia32_tile_storeconfig(__config);
53 }
54 
55 /// Release the tile configuration to return to the init state, which
56 /// releases all storage it currently holds.
57 ///
58 /// \headerfile <x86intrin.h>
59 ///
60 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
_tile_release(void)61 static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
62   __builtin_ia32_tilerelease();
63 }
64 
65 /// Load tile rows from memory specifieid by "base" address and "stride" into
66 /// destination tile "dst" using the tile configuration previously configured
67 /// via "_tile_loadconfig".
68 ///
69 /// \headerfile <x86intrin.h>
70 ///
71 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
72 ///
73 /// \param dst
74 ///    A destination tile. Max size is 1024 Bytes.
75 /// \param base
76 ///    A pointer to base address.
77 /// \param stride
78 ///    The stride between the rows' data to be loaded in memory.
79 #define _tile_loadd(dst, base, stride)                                         \
80   __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
81                              (__SIZE_TYPE__)(stride))
82 
83 /// Load tile rows from memory specifieid by "base" address and "stride" into
84 /// destination tile "dst" using the tile configuration previously configured
85 /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
86 /// that the data will likely not be reused in the near future and the data
87 /// caching can be optimized accordingly.
88 ///
89 /// \headerfile <x86intrin.h>
90 ///
91 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
92 ///
93 /// \param dst
94 ///    A destination tile. Max size is 1024 Bytes.
95 /// \param base
96 ///    A pointer to base address.
97 /// \param stride
98 ///    The stride between the rows' data to be loaded in memory.
99 #define _tile_stream_loadd(dst, base, stride)                                  \
100   __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
101                                (__SIZE_TYPE__)(stride))
102 
103 /// Store the tile specified by "src" to memory specifieid by "base" address and
104 /// "stride" using the tile configuration previously configured via
105 /// "_tile_loadconfig".
106 ///
107 /// \headerfile <x86intrin.h>
108 ///
109 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
110 ///
111 /// \param dst
112 ///    A destination tile. Max size is 1024 Bytes.
113 /// \param base
114 ///    A pointer to base address.
115 /// \param stride
116 ///    The stride between the rows' data to be stored in memory.
117 #define _tile_stored(dst, base, stride)                                        \
118   __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
119 
120 /// Zero the tile specified by "tdest".
121 ///
122 /// \headerfile <x86intrin.h>
123 ///
124 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
125 ///
126 /// \param tile
127 ///    The destination tile to be zero. Max size is 1024 Bytes.
128 #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
129 
130 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
131 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
132 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
133 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
134 /// and store the 32-bit result back to tile "dst".
135 ///
136 /// \headerfile <x86intrin.h>
137 ///
138 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
139 ///
140 /// \param dst
141 ///    The destination tile. Max size is 1024 Bytes.
142 /// \param src0
143 ///    The 1st source tile. Max size is 1024 Bytes.
144 /// \param src1
145 ///    The 2nd source tile. Max size is 1024 Bytes.
146 #define _tile_dpbssd(dst, src0, src1)                                          \
147   __builtin_ia32_tdpbssd((dst), (src0), (src1))
148 
149 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
150 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
151 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
152 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
153 /// in "dst", and store the 32-bit result back to tile "dst".
154 ///
155 /// \headerfile <x86intrin.h>
156 ///
157 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
158 ///
159 /// \param dst
160 ///    The destination tile. Max size is 1024 Bytes.
161 /// \param src0
162 ///    The 1st source tile. Max size is 1024 Bytes.
163 /// \param src1
164 ///    The 2nd source tile. Max size is 1024 Bytes.
165 #define _tile_dpbsud(dst, src0, src1)                                          \
166   __builtin_ia32_tdpbsud((dst), (src0), (src1))
167 
168 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
169 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
170 /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
171 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
172 /// and store the 32-bit result back to tile "dst".
173 ///
174 /// \headerfile <x86intrin.h>
175 ///
176 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
177 ///
178 /// \param dst
179 ///    The destination tile. Max size is 1024 Bytes.
180 /// \param src0
181 ///    The 1st source tile. Max size is 1024 Bytes.
182 /// \param src1
183 ///    The 2nd source tile. Max size is 1024 Bytes.
184 #define _tile_dpbusd(dst, src0, src1)                                          \
185   __builtin_ia32_tdpbusd((dst), (src0), (src1))
186 
187 /// Compute dot-product of bytes in tiles with a source/destination accumulator.
188 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
189 /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
190 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
191 /// "dst", and store the 32-bit result back to tile "dst".
192 ///
193 /// \headerfile <x86intrin.h>
194 ///
195 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
196 ///
197 /// \param dst
198 ///    The destination tile. Max size is 1024 Bytes.
199 /// \param src0
200 ///    The 1st source tile. Max size is 1024 Bytes.
201 /// \param src1
202 ///    The 2nd source tile. Max size is 1024 Bytes.
203 #define _tile_dpbuud(dst, src0, src1)                                          \
204   __builtin_ia32_tdpbuud((dst), (src0), (src1))
205 
206 /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
207 /// src1, accumulating the intermediate single-precision (32-bit) floating-point
208 /// elements with elements in "dst", and store the 32-bit result back to tile
209 /// "dst".
210 ///
211 /// \headerfile <x86intrin.h>
212 ///
213 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
214 ///
215 /// \param dst
216 ///    The destination tile. Max size is 1024 Bytes.
217 /// \param src0
218 ///    The 1st source tile. Max size is 1024 Bytes.
219 /// \param src1
220 ///    The 2nd source tile. Max size is 1024 Bytes.
221 #define _tile_dpbf16ps(dst, src0, src1)                                        \
222   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
223 
224 #define __DEFAULT_FN_ATTRS_INT8                                                \
225   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
226 
227 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
228 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m,unsigned short n,const void * base,__SIZE_TYPE__ stride)229 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
230                      __SIZE_TYPE__ stride) {
231   return __builtin_ia32_tileloadd64_internal(m, n, base,
232                                              (__SIZE_TYPE__)(stride));
233 }
234 
235 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m,unsigned short n,unsigned short k,_tile1024i dst,_tile1024i src1,_tile1024i src2)236 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
237                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
238   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
239 }
240 
241 static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m,unsigned short n,void * base,__SIZE_TYPE__ stride,_tile1024i tile)242 _tile_stored_internal(unsigned short m, unsigned short n, void *base,
243                       __SIZE_TYPE__ stride, _tile1024i tile) {
244   return __builtin_ia32_tilestored64_internal(m, n, base,
245                                               (__SIZE_TYPE__)(stride), tile);
246 }
247 
248 typedef struct __tile1024i_str {
249   const unsigned short row;
250   const unsigned short col;
251   _tile1024i tile;
252 } __tile1024i;
253 
254 __DEFAULT_FN_ATTRS_TILE
__tile_loadd(__tile1024i * dst,const void * base,__SIZE_TYPE__ stride)255 static void __tile_loadd(__tile1024i *dst, const void *base,
256                          __SIZE_TYPE__ stride) {
257   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
258 }
259 
260 __DEFAULT_FN_ATTRS_INT8
__tile_dpbssd(__tile1024i * dst,__tile1024i src1,__tile1024i src2)261 static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1,
262                           __tile1024i src2) {
263   dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
264                                     src1.tile, src2.tile);
265 }
266 
267 __DEFAULT_FN_ATTRS_TILE
__tile_stored(void * base,__SIZE_TYPE__ stride,__tile1024i src)268 static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
269   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
270 }
271 
272 __DEFAULT_FN_ATTRS_TILE
__tile_zero(__tile1024i * dst)273 static void __tile_zero(__tile1024i *dst) {
274   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
275 }
276 
277 #endif /* __x86_64__ */
278 #endif /* __AMXINTRIN_H */
279