1//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines all of the AMDGPU-specific intrinsics.
10//
11//===----------------------------------------------------------------------===//
12
// Pointer type for the global address space (addrspace(1)).
13def global_ptr_ty : LLVMQualPointerType<1>;
14
// Base class for intrinsics that read a preloaded register value: takes no
// operands, returns i32, has no memory effects, and may be speculated/CSE'd.
15class AMDGPUReadPreloadRegisterIntrinsic
16  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
17
// Same as above, but additionally exposed to Clang as the builtin 'name'.
18class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
19  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>;
20
21// Used to tag image and resource intrinsics with information used to generate
22// mem operands.
23class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
24  int RsrcArg = rsrcarg; // operand index of the resource descriptor
25  bit IsImage = isimage; // true for image (MIMG) intrinsics
26}
27
28let TargetPrefix = "r600" in {
29
// Defines the _x/_y/_z component triple of a read-preload intrinsic.
30multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
31  def _x : AMDGPUReadPreloadRegisterIntrinsic;
32  def _y : AMDGPUReadPreloadRegisterIntrinsic;
33  def _z : AMDGPUReadPreloadRegisterIntrinsic;
34}
35
// Defines the _x/_y/_z component triple of a read-preload intrinsic, each
// component also published as the Clang builtin "<prefix>_<component>".
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<prefix # "_x">;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<prefix # "_y">;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<prefix # "_z">;
}
41
// Grid/group size queries, each also exposed as a Clang builtin.
42defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
43                                 <"__builtin_r600_read_global_size">;
44defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
45                             <"__builtin_r600_read_ngroups">;
46defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
47                          <"__builtin_r600_read_tgid">;
48
// Local size and thread-id-in-group queries (no Clang builtin wrappers).
49defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
50defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
51
// Work-group barrier; convergent so control flow around it is preserved.
52def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
53  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;
54
55// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
56def int_r600_implicitarg_ptr :
57  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
58  DefaultAttrsIntrinsic<[LLVMQualPointerType<7>], [],
59  [IntrNoMem, IntrSpeculatable]>;
60
// Typed store through a RAT (random access target).
61def int_r600_rat_store_typed :
62  // 1st parameter: Data
63  // 2nd parameter: Index
64  // 3rd parameter: Constant RAT ID
65  DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>,
66  ClangBuiltin<"__builtin_r600_rat_store_typed">;
67
// Reciprocal square root, IEEE variant (per the name).
68def int_r600_recipsqrt_ieee :  DefaultAttrsIntrinsic<
69  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
70>;
71
// Reciprocal square root with a clamped result (per the name).
72def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic<
73  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
74>;
75
// Cube-map coordinate computation on a v4f32.
76def int_r600_cube : DefaultAttrsIntrinsic<
77  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
78>;
79
// Stream-output store: v4f32 data plus three i32 control operands.
80def int_r600_store_stream_output : DefaultAttrsIntrinsic<
81  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
82>;
83
// Texture operation with float coordinates: returns a v4f32 texel; the i32
// operands select per-axis offsets, resource/sampler ids, and per-component
// coordinate types.
84class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [
85  llvm_v4f32_ty, // Coord
86  llvm_i32_ty,   // offset_x
87  llvm_i32_ty,   // offset_y,
88  llvm_i32_ty,   // offset_z,
89  llvm_i32_ty,   // resource_id
90  llvm_i32_ty,   // samplerid
91  llvm_i32_ty,   // coord_type_x
92  llvm_i32_ty,   // coord_type_y
93  llvm_i32_ty,   // coord_type_z
94  llvm_i32_ty],  // coord_type_w
95  [IntrNoMem]
96>;
97
// Integer-coordinate counterpart of TextureIntrinsicFloatInput: identical
// operand layout, but the coordinate vector and the result are v4i32.
class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [
  llvm_v4i32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;
111
// Swizzled store of a v4f32 with two i32 control operands.
112def int_r600_store_swizzle :
113  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
114>;
115
// Texture opcodes with float coordinates (tex/txl/txb and compare forms).
116def int_r600_tex : TextureIntrinsicFloatInput;
117def int_r600_texc : TextureIntrinsicFloatInput;
118def int_r600_txl : TextureIntrinsicFloatInput;
119def int_r600_txlc : TextureIntrinsicFloatInput;
120def int_r600_txb : TextureIntrinsicFloatInput;
121def int_r600_txbc : TextureIntrinsicFloatInput;
// Fetch (txf) and query (txq) take integer coordinates.
122def int_r600_txf : TextureIntrinsicInt32Input;
123def int_r600_txq : TextureIntrinsicInt32Input;
124def int_r600_ddx : TextureIntrinsicFloatInput;
125def int_r600_ddy : TextureIntrinsicFloatInput;
126
// 4-element dot product of two v4f32, producing a scalar float.
127def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty],
128  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
129>;
130
// Kill based on a float condition value.
131def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>;
132
133} // End TargetPrefix = "r600"
134
135let TargetPrefix = "amdgcn" in {
136
137//===----------------------------------------------------------------------===//
138// ABI Special Intrinsics
139//===----------------------------------------------------------------------===//
140
// Per-thread and per-group id queries; workgroup_id is also a Clang builtin.
141defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
142defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
143                               <"__builtin_amdgcn_workgroup_id">;
144
// Pointer to the dispatch packet (addrspace 4, return aligned to 4 bytes).
145def int_amdgcn_dispatch_ptr :
146  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
147  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
148
// Pointer to the queue object (addrspace 4, return aligned to 4 bytes).
149def int_amdgcn_queue_ptr :
150  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
151  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
152  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
153
// Pointer to the kernel argument segment (addrspace 4).
154def int_amdgcn_kernarg_segment_ptr :
155  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
156  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
157  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
158
// Pointer to the implicit kernel arguments (addrspace 4); the amdgcn
// counterpart of int_r600_implicitarg_ptr above.
159def int_amdgcn_implicitarg_ptr :
160  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
161  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
162  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
163
// Statically allocated group (LDS) memory size query; returns i32.
164def int_amdgcn_groupstaticsize :
165  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
166  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
167
// 64-bit id of the current dispatch.
168def int_amdgcn_dispatch_id :
169  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
170  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;
171
172// For internal use. Coordinates LDS lowering between IR transform and backend.
173def int_amdgcn_lds_kernel_id :
174  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
175
// Pointer to the implicit buffer (addrspace 4, return aligned to 4 bytes).
176def int_amdgcn_implicit_buffer_ptr :
177  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
178  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
179  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
180
181// Set EXEC to the 64-bit value given.
182// This is always moved to the beginning of the basic block.
183// FIXME: Should be mangled for wave size.
184def int_amdgcn_init_exec : Intrinsic<[],
185  [llvm_i64_ty],      // 64-bit literal constant (must be an immediate)
186  [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback,
187   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>;
188
189// Set EXEC according to a thread count packed in an SGPR input:
190//    thread_count = (input >> bitoffset) & 0x7f;
191// This is always moved to the beginning of the basic block.
192// Note: only inreg arguments to the parent function are valid as
193// inputs to this intrinsic, computed values cannot be used.
194def int_amdgcn_init_exec_from_input : Intrinsic<[],
195  [llvm_i32_ty,       // 32-bit SGPR input
196   llvm_i32_ty],      // bit offset of the thread count (immediate)
197  [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
198   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
199
// Number of lanes in a wavefront, as an i32.
200def int_amdgcn_wavefrontsize :
201  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
202  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
203
204// Represent a relocation constant.
205def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic<
206  [llvm_i32_ty], [llvm_metadata_ty],
207  [IntrNoMem, IntrSpeculatable]
208>;
209
210//===----------------------------------------------------------------------===//
211// Instruction Intrinsics
212//===----------------------------------------------------------------------===//
213
214// The first parameter is s_sendmsg immediate (i16),
215// the second one is copied to m0
216def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
217  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
218  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
// Same operand layout as s_sendmsg, but halts the wave after sending.
219def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
220  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
221  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
222
223
224// gfx11 intrinsic
225// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64.
226def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
227  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
228
// Work-group execution barrier; convergent with side effects so it is not
// moved, duplicated, or deleted.
229def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
230  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
231
// Signal a named barrier; the barrier id is an immediate.
232def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
233  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
234                                IntrNoCallback, IntrNoFree]>;
235
// Variant taking the barrier id in a register (no ImmArg).
236def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
237  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
238                                IntrNoCallback, IntrNoFree]>;
239
// Signal and return an i1 "is first" flag; barrier id is an immediate.
240def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
241  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
242                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
243
// As above, with the barrier id in a register.
244def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
245  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
246                                IntrNoCallback, IntrNoFree]>;
247
// Initialize a named barrier with a member count (both i32 operands).
248def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
249  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
250                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
251
// Join the named barrier.
252def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
253  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
254                                IntrNoCallback, IntrNoFree]>;
255
// Wake waves waiting on the named barrier.
256def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
257  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
258                                IntrNoCallback, IntrNoFree]>;
259
// Wait on the named barrier; the i16 barrier id is an immediate.
260def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
261  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
262                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
263
// Leave the barrier; returns an i1 status flag.
264def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
265  Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
266
// Query the state of the named barrier as an i32.
267def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
268  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
269                                IntrNoCallback, IntrNoFree]>;
270
// Pseudo-barrier within a single wave (compiler scheduling fence only).
271def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
272  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
273
274// The 1st parameter is a mask for the types of instructions that may be allowed
275// to cross the SCHED_BARRIER during scheduling.
276//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
277//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
278//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
279//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
280//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
281//     MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER.
282//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
283//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
284//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
285//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
286//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
287//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
288def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
289  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
290                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
291
292// The first parameter is a mask that determines the types of instructions that
293// you would like to synchronize around and add to a scheduling group. The
294// values of the mask are defined above for sched_barrier. These instructions
295// will be selected from the bottom up starting from the sched_group_barrier's
296// location during instruction scheduling. The second parameter is the number of
297// matching instructions that will be associated with this sched_group_barrier.
298// The third parameter is an identifier which is used to describe what other
299// sched_group_barriers should be synchronized with.
300def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
301  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
302  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
303   IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
304
305// Scheduler optimization hint.
306//     MASK = 0: Small gemm opt
307def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
308  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
309                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
310
// Wait on outstanding counters; the i32 operand is an immediate encoding.
311def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
312  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
313
314// GFX12 intrinsics
// Common shape of the split counter waits: one i16 immediate threshold.
315class AMDGPUWaitIntrinsic :
316  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// One wait intrinsic per GFX12 hardware counter.
317def int_amdgcn_s_wait_bvhcnt         : AMDGPUWaitIntrinsic;
318def int_amdgcn_s_wait_dscnt          : AMDGPUWaitIntrinsic;
319def int_amdgcn_s_wait_expcnt         : AMDGPUWaitIntrinsic;
320def int_amdgcn_s_wait_kmcnt          : AMDGPUWaitIntrinsic;
321def int_amdgcn_s_wait_loadcnt        : AMDGPUWaitIntrinsic;
322def int_amdgcn_s_wait_samplecnt      : AMDGPUWaitIntrinsic;
323def int_amdgcn_s_wait_storecnt       : AMDGPUWaitIntrinsic;
324
// Operand scaling step of the division expansion; returns the scaled value
// plus an i1 flag consumed by div_fmas.
325def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
326  // 1st parameter: Numerator
327  // 2nd parameter: Denominator
328  // 3rd parameter: Select quotient. Must equal Numerator or Denominator.
329  //                (0 = Denominator, 1 = Numerator).
330  [llvm_anyfloat_ty, llvm_i1_ty],
331  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
332  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
333>;
334
// Fused multiply-add used with div_scale; the trailing i1 selects scaling.
335def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
336  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
337  [IntrNoMem, IntrSpeculatable]
338>;
339
// Final fixup step of the division expansion.
340def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
341  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
342  [IntrNoMem, IntrSpeculatable]
343>;
344
345// Look Up 2.0 / pi src0 with segment select src1[4:0]
346def int_amdgcn_trig_preop : DefaultAttrsIntrinsic<
347  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
348  [IntrNoMem, IntrSpeculatable]
349>;
350
// Hardware sine approximation (v_sin).
351def int_amdgcn_sin : DefaultAttrsIntrinsic<
352  [llvm_anyfloat_ty], [LLVMMatchType<0>],
353  [IntrNoMem, IntrSpeculatable]
354>;
355
// Hardware cosine approximation (v_cos).
356def int_amdgcn_cos : DefaultAttrsIntrinsic<
357  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
358>;
359
360// v_log_{f16|f32}, performs log2. f32 version does not handle
361// denormals. There is no reason to use this for f16 as it does
362// support denormals, and the generic log2 intrinsic should be
363// preferred.
364def int_amdgcn_log : DefaultAttrsIntrinsic<
365  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
366>;
367
368// v_exp_{f16|f32} (int_amdgcn_exp was taken by export
369// already). Performs exp2. f32 version does not handle
370// denormals. There is no reason to use this for f16 as it does
371// support denormals, and the generic exp2 intrinsic should be
372// preferred.
373def int_amdgcn_exp2 : DefaultAttrsIntrinsic<
374  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
375>;
376
// log2 with a clamped result (per the name).
377def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
378  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
379>;
380
// Multiply with legacy semantics (see fma_legacy note below: +/- 0.0 times
// anything is +0.0).
381def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
382  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
383  [IntrNoMem, IntrSpeculatable, Commutative]
384>;
385
386// Fused single-precision multiply-add with legacy behaviour for the multiply,
387// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
388// intended for use on subtargets that have the v_fma_legacy_f32 and/or
389// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
390// has a completely different kind of legacy behaviour.)
391def int_amdgcn_fma_legacy :
392  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
393  [IntrNoMem, IntrSpeculatable, Commutative]
394>;
395
// Hardware reciprocal approximation.
396def int_amdgcn_rcp : DefaultAttrsIntrinsic<
397  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
398>;
399
// Reciprocal with legacy (non-IEEE) semantics, f32 only.
400def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
401  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty],
402  [IntrNoMem, IntrSpeculatable]
403>;
404
// Hardware square root.
405def int_amdgcn_sqrt :  DefaultAttrsIntrinsic<
406  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
407>;
408
// Reciprocal square root approximation.
409def int_amdgcn_rsq :  DefaultAttrsIntrinsic<
410  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
411>;
412
// Reciprocal square root with legacy (non-IEEE) semantics, f32 only.
413def int_amdgcn_rsq_legacy :  ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
414  DefaultAttrsIntrinsic<
415  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
416>;
417
418// out = 1.0 / sqrt(a) result clamped to +/- max_float.
419def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic<
420  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;
421
// Mantissa extraction (v_frexp_mant).
422def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic<
423  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
424>;
425
// Exponent extraction (v_frexp_exp); integer result type.
426def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic<
427  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
428>;
429
430// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
431// and always uses rtz, so is not suitable for implementing the OpenCL
432// fract function. It should be ok on VI.
433def int_amdgcn_fract : DefaultAttrsIntrinsic<
434  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
435>;
436
// Pack two f32 into a v2f16, rounding toward zero (rtz).
437def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
438  DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
439            [IntrNoMem, IntrSpeculatable]
440>;
441
// Pack two f32 into a signed normalized v2i16.
442def int_amdgcn_cvt_pknorm_i16 :
443  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
444  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
445            [IntrNoMem, IntrSpeculatable]
446>;
447
// Pack two f32 into an unsigned normalized v2i16.
448def int_amdgcn_cvt_pknorm_u16 :
449  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
450  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
451            [IntrNoMem, IntrSpeculatable]
452>;
453
// Pack two i32 into a v2i16 (signed conversion).
454def int_amdgcn_cvt_pk_i16 :
455    ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
456    DefaultAttrsIntrinsic<
457  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
458  [IntrNoMem, IntrSpeculatable]
459>;
460
// Pack two i32 into a v2i16 (unsigned conversion).
461def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
462  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
463    [IntrNoMem, IntrSpeculatable]
464>;
465
// Floating-point class test: i1 result of testing the value against the
// i32 class mask (v_cmp_class).
466def int_amdgcn_class : DefaultAttrsIntrinsic<
467  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
468  [IntrNoMem, IntrSpeculatable]
469>;
470
// Median of three floats (v_med3).
471def int_amdgcn_fmed3 :
472  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
473    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
474    [IntrNoMem, IntrSpeculatable]
475>;
476
// Cube-map helpers, each taking the three coordinate components:
// cubeid = face id, cubema = major axis, cubesc = S coord, cubetc = T coord.
477def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
478  DefaultAttrsIntrinsic<[llvm_float_ty],
479    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
480    [IntrNoMem, IntrSpeculatable]
481>;
482
483def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
484  DefaultAttrsIntrinsic<[llvm_float_ty],
485  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
486  [IntrNoMem, IntrSpeculatable]
487>;
488
489def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
490  DefaultAttrsIntrinsic<[llvm_float_ty],
491    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
492    [IntrNoMem, IntrSpeculatable]
493>;
494
495def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
496  DefaultAttrsIntrinsic<[llvm_float_ty],
497    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
498    [IntrNoMem, IntrSpeculatable]
499>;
500
501// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
502// should be used.
503def int_amdgcn_sffbh :
504  DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
505  [IntrNoMem, IntrSpeculatable]
506>;
507
508// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
509def int_amdgcn_fmad_ftz :
510  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
511            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
512            [IntrNoMem, IntrSpeculatable]
513>;
514
// Common profile for LDS (addrspace 3) atomic float operations: pointer,
// value, plus immediate ordering/scope/volatile controls.
515class AMDGPULDSIntrin :
516  Intrinsic<[llvm_any_ty],
517    [LLVMQualPointerType<3>,
518    LLVMMatchType<0>,
519    llvm_i32_ty, // ordering
520    llvm_i32_ty, // scope
521    llvm_i1_ty], // isVolatile
522    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
523     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]
524>;
525
526// FIXME: The m0 argument should be moved after the normal arguments
// Profile for ds_ordered_* operations; ordering/scope/volatile and the
// control bits must be immediates.
527class AMDGPUDSOrderedIntrinsic : Intrinsic<
528  [llvm_i32_ty],
529  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
530  // the bit packing can be optimized at the IR level.
531  [LLVMQualPointerType<2>, // IntToPtr(M0)
532   llvm_i32_ty, // value to add or swap
533   llvm_i32_ty, // ordering
534   llvm_i32_ty, // scope
535   llvm_i1_ty,  // isVolatile
536   llvm_i32_ty, // ordered count index (OA index), also added to the address
537                // gfx10: bits 24-27 indicate the number of active threads/dwords
538   llvm_i1_ty,  // wave release, usually set to 1
539   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
540  [IntrWillReturn, NoCapture<ArgIndex<0>>,
541   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
542   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree
543  ]
544>;
545
// Profile for ds_append/ds_consume style counter operations on an LDS or
// GDS pointer; carries a mem operand (SDNPMemOperand).
546class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
547  [llvm_i32_ty],
548  [llvm_anyptr_ty, // LDS or GDS ptr
549   llvm_i1_ty], // isVolatile
550   [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
551    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
552   "",
553   [SDNPMemOperand]
554>;
555
// Ordered-count add and swap.
556def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
557def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
558
559// The pointer argument is assumed to be dynamically uniform if a VGPR.
560def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
561def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
562
// LDS atomic float add/min/max.
563def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
564def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
565def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
566
567} // TargetPrefix = "amdgcn"
568
569// New-style image intrinsics
570
571//////////////////////////////////////////////////////////////////////////
572// Dimension-aware image intrinsics framework
573//////////////////////////////////////////////////////////////////////////
574
575// Helper class to represent (type, name) combinations of arguments. The
576// argument names are explanatory and used as DAG operand names for codegen
577// pattern matching.
578class AMDGPUArg<LLVMType ty, string name> {
579  LLVMType Type = ty;   // the argument's LLVM type
580  string Name = name;   // DAG operand name
581}
582
583// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
// Only the first argument carries the concrete (possibly overloaded) type;
// the rest match it via LLVMMatchType<0>.
584class makeArgList<list<string> names, LLVMType basety> {
585  list<AMDGPUArg> ret =
586    !listconcat([AMDGPUArg<basety, names[0]>],
587                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
588}
589
590// Return arglist, with LLVMMatchType's references shifted by 'shift'.
// Non-match types are passed through unchanged.
591class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
592  list<AMDGPUArg> ret =
593    !foreach(arg, arglist,
594             !if(!isa<LLVMMatchType>(arg.Type),
595                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
596                           arg.Name>,
597                 arg));
598}
599
600// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
601// accordingly, and shifted by an additional 'shift'.
602class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
603  list<AMDGPUArg> ret =
604    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
605           !listconcat(
606             lhs,
             // The running count of overloaded (isAny) types already in 'lhs'
             // determines how far 'rhs's match indices must be shifted.
607             arglistmatchshift<rhs,
608                               !add(shift, !foldl(0, lhs, a, b,
609                                                  !add(a, b.Type.isAny)))>.ret));
610}
611
612// Represent texture/image types / dimensionality.
613class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
614                     list<string> coord_names, list<string> slice_names,
615                     bit msaa = 0> {
616  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME); // self-reference by record name
617  string Name = name; // e.g. "2darraymsaa"
618  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
619  bits<3> Encoding = enc; // dimension encoding in the instruction
620  bit DA = 0; // DA bit in MIMG encoding
621  bit MSAA = msaa;
622
  // Coordinate + slice arguments, in float and integer flavours.
623  list<AMDGPUArg> CoordSliceArgs =
624    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
625  list<AMDGPUArg> CoordSliceIntArgs =
626    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  // Per-coordinate horizontal (d*dh) then vertical (d*dv) derivatives.
627  list<AMDGPUArg> GradientArgs =
628    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
629                            !foreach(name, coord_names, "d" # name # "dv")),
630                llvm_anyfloat_ty>.ret;
631
632  bits<8> NumCoords = !size(CoordSliceArgs);
633  bits<8> NumGradients = !size(GradientArgs);
634}
635
// Concrete dimensions; arrayed and cube variants set the DA bit.
636def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
637def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
638def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
639let DA = 1 in {
640  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
641  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
642  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
643}
644def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
645let DA = 1 in {
646  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
647}
648
// Convenience groupings of the dimensions defined above.
649def AMDGPUDims {
650  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
651                                 AMDGPUDimCube, AMDGPUDim1DArray,
652                                 AMDGPUDim2DArray];
653  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
654  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
655}
656
657// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
658class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
659  string UpperCaseMod = ucmod; // instruction-name modifier, e.g. "_C_O"
660  string LowerCaseMod = lcmod; // intrinsic-name modifier, e.g. "_c_o"
661
662  // {offset} {bias} {z-compare}
663  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  // Flags recording which modifiers this variant carries; set via 'let'
  // in the defset below.
664  bit Offset = false;
665  bit Bias = false;
666  bit ZCompare = false;
667  bit Gradients = false;
668
669  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
670  // if any.
671  string LodOrClamp = "";
672}
673
674// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
675// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
676defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  // Base helper: the plain variant plus its _o (offset) form.
677  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
678                                       list<AMDGPUArg> extra_addr> {
679    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
680    let Offset = true in
681    def NAME#lcmod#_o : AMDGPUSampleVariant<
682        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
683  }
684
  // Adds the _c (depth-compare) forms on top of the offset forms.
685  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
686                                        list<AMDGPUArg> extra_addr> {
687    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
688    let ZCompare = true in
689    defm NAME : AMDGPUSampleHelper_Offset<
690        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
691  }
692
  // Adds the _cl (LOD-clamp) forms on top of the compare forms.
693  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
694                                      list<AMDGPUArg> extra_addr> {
695    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
696    let LodOrClamp = "clamp" in
697    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
698  }
699
700  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
701    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
702    let Bias = true in
703    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
704        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
705    let LodOrClamp = "lod" in
706    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
707    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
708  }
709
  // Derivative-based (_d/_cd) forms carry explicit gradients.
710  let Gradients = true in {
711    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
712    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
713  }
714}
715
716// Helper class to capture the profile of a dimension-aware image intrinsic.
717// This information is used to generate the intrinsic's type and to inform
718// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  // Number of overloaded ("any") types among the return and data arguments;
  // used to offset match-type numbering for the address arguments.
  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  // Full address argument list, in order: extra args (offset/bias/zcompare),
  // gradients (if enabled), coordinate/slice args, then the optional trailing
  // argument named by LodClampMip.
  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  // Address args with overloaded/matched types resolved to the default widths
  // (float for sample ops, i32 otherwise).
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  // Address args with overloaded/matched types resolved to the 16-bit "A16"
  // forms (half for sample ops, i16 otherwise).
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}
760
// Clones every overridable field of an existing profile, so that subclasses
// can tweak individual fields without restating the rest.
class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}
773
// Profile for sampling ops (IMAGE_SAMPLE*/IMAGE_GATHER4*): overloaded return
// type, with extra address arguments and modifier flags taken from the
// given sample variant.
class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}
786
// Profile for non-sampling ops (loads/stores). Mip selects the *_MIP form,
// which appends a trailing "mip" address argument.
class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}
796
// Profile for image atomics: overloaded integer return type; IsAtomic = true
// (which, among other things, suppresses the dmask argument).
class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}
804
// Same as AMDGPUDimAtomicProfile, but with an overloaded float return type
// (for FP atomics such as fmin/fmax/add_flt).
class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}
810
// Profile for GET_RESINFO: no data arguments, an overloaded float return,
// and a single overloaded-int "mip" address argument.
class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}
818
819// Helper class for figuring out image intrinsic argument indexes.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  int NumDataArgs = !size(P_.DataArgs);
  // Atomics have no dmask operand.
  int NumDmaskArgs = !not(P_.IsAtomic);
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  // Sample ops additionally take a sampler descriptor and the unorm flag.
  int NumSampArgs = !if(P_.IsSample, 2, 0);
  // Argument layout: data..., dmask, vaddr (offset, bias, zcompare,
  // gradients, coords, lod/clamp/mip), rsrc, [samp, unorm], texfailctrl,
  // cachepolicy.
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  // The lod/clamp/mip argument, when present, is the last vaddr argument.
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
  int MipArgIndex = LodArgIndex;
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}
847
848// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : Intrinsic<
    P_.RetTypes,        // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type),      // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),       // dmask(imm)
      P_.AddrTypes,                              // vaddr(VGPR)
      [llvm_v8i32_ty],                           // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,           // samp(SGPR)
                        llvm_i1_ty], []),        // unorm(imm)
      [llvm_i32_ty,                              // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                            // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc;
                                                 //   gfx12+ imm: bits [0-2] = th, bits [3-4] = scope)

     // dmask (non-atomics) and unorm (samples) must be immediates, as must
     // texfailctrl and cachepolicy; atomics do not get IntrNoSync.
     !listconcat(props, [IntrNoCallback, IntrNoFree, IntrWillReturn],
          !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
          !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
          [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>],
          !if(P_.IsAtomic, [], [IntrNoSync])),


      "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  // The fully-resolved record itself, so that consumers of the defsets can
  // refer back to the complete intrinsic definition.
  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}
881
// Marker class for intrinsics with a DMask that determines the returned
// channels. (Mix-in only; it declares no fields.)
class AMDGPUImageDMaskIntrinsic;
885
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

  //////////////////////////////////////////////////////////////////////////
  // Load and store intrinsics
  //////////////////////////////////////////////////////////////////////////
  // Instantiates one intrinsic per non-MSAA dimension, suffixed "_<dim>".
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  // Instantiates one intrinsic per dimension, including MSAA dimensions.
  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = false> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;

  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>,
              AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>,
              AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // MSAA intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  //////////////////////////////////////////////////////////////////////////
  // sample and getlod intrinsics
  //////////////////////////////////////////////////////////////////////////
  // Instantiates one sample intrinsic per non-MSAA dimension; NoMem is used
  // for GET_LOD, which does not access memory.
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  // One family of sample intrinsics per sample variant (offset/compare/
  // clamp/bias/lod/gradient combinations) per dimension.
  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }

  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }

  //////////////////////////////////////////////////////////////////////////
  // gather4 intrinsics
  //////////////////////////////////////////////////////////////////////////
  // Gather4 supports only the no-gradient sample variants, and only 2D,
  // cube, and 2D-array dimensions.
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
}
999
1000//////////////////////////////////////////////////////////////////////////
1001// atomic intrinsics
1002//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  // Instantiates one atomic intrinsic per dimension with the given explicit
  // data arguments; isFloat selects the float profile over the integer one.
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
        foreach dim = AMDGPUDims.All in {
          def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
              !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
                   AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
              [], [SDNPMemOperand]>;
        }
  }

  // Common case: a single "vdata" operand whose type matches the return.
  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;
  defm int_amdgcn_image_atomic_add_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_ADD_FLT">;
  defm int_amdgcn_image_atomic_min_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MIN_FLT">;
  defm int_amdgcn_image_atomic_max_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MAX_FLT">;

  // Compare-and-swap takes explicit "src" and "cmp" operands instead of the
  // single "vdata" used by the other atomics.
  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;

  defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">;
  defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_BF16">;
}
1049
1050//////////////////////////////////////////////////////////////////////////
1051// Buffer intrinsics
1052//////////////////////////////////////////////////////////////////////////
1053
// Data type for buffer resources (V#). Maybe, in the future, we can create a
// similar one for textures (T#).
def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>; // i.e. ptr addrspace(8)
1057
1058let TargetPrefix = "amdgcn" in {
1059
// Builds a buffer resource (V#) from its components: base pointer, stride,
// extent, and flags.
def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
  [AMDGPUBufferRsrcTy],
  [llvm_anyptr_ty, // base
   llvm_i16_ty,    // stride (and swizzle control)
   llvm_i32_ty,    // NumRecords / extent
   llvm_i32_ty],   // flags
  // Attributes lifted from ptrmask + some extra argument attributes.
  [IntrNoMem, ReadNone<ArgIndex<0>>,
   IntrSpeculatable, IntrWillReturn]>;
1069
1070defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {
1071
// Old-style buffer load (pre raw/struct split, see below): a single offset
// argument, with glc/slc as separate i1 immediates.
class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;
1083
1084// Generate a buffer_load instruction that may be optimized to s_buffer_load if
1085// the offset argument is uniform.
// IntrNoMem (rather than IntrReadMem) plus an immediate cachepolicy: this
// load is modeled as reading invariant memory.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc;
                      //   gfx12+ imm: bits [0-2] = th, bits [3-4] = scope)
                      // Note: volatile bit is **not** permitted here.
  [IntrNoMem, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;
1095
// Old-style buffer store, mirroring AMDGPUBufferLoad with a leading vdata
// operand.
class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,          // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;
1108
1109// New buffer intrinsics with separate raw and struct variants.  The raw
1110// variant never has an index. The struct variant always has an index, even if
1111// it is const 0. A struct intrinsic with constant 0 index is different to the
1112// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
1113// and swizzling changes depending on whether idxen is set in the instruction.
// These new intrinsics also keep the offset and soffset arguments separate as
1115// they behave differently in bounds checking and swizzling.
1116
1117// The versions of these intrinsics that take <4 x i32> arguments are deprecated
1118// in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments,
1119// which allow for improved reasoning about memory accesses.
1120//
1121// Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved
1122// through to final assembly selection and is used to signal that the buffer
1123// operation is volatile.
// Raw buffer load on a <4 x i32> rsrc: no vindex; offset and soffset are
// kept separate.
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10/gfx11),
                      //                      swizzled buffer (bit 3 = swz),
                      //                  gfx12+:
                      //                      cachepolicy (bits [0-2] = th,
                      //                                   bits [3-4] = scope)
                      //                      swizzled buffer (bit 6 = swz),
                      //                  all:
                      //                      volatile op (bit 31, stripped at lowering))
  [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;
1143
// Raw buffer load on a ptr addrspace(8) rsrc; the pointer argument carries
// argmemonly/readonly/nocapture attributes.
class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [AMDGPUBufferRsrcTy,         // rsrc(SGPR)
   llvm_i32_ty,                 // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,                 // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],                // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                //                                   bit 1 = slc,
                                //                                   bit 2 = dlc on gfx10/gfx11),
                                //                      swizzled buffer (bit 3 = swz),
                                //                  gfx12+:
                                //                      cachepolicy (bits [0-2] = th,
                                //                                   bits [3-4] = scope)
                                //                      swizzled buffer (bit 6 = swz),
                                //                  all:
                                //                      volatile op (bit 31, stripped at lowering))

  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
  ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;
1165
// Struct buffer load on a <4 x i32> rsrc: always has a vindex, even when it
// is a constant 0.
class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10/gfx11),
                      //                      swizzled buffer (bit 3 = swz),
                      //                  gfx12+:
                      //                      cachepolicy (bits [0-2] = th,
                      //                                   bits [3-4] = scope)
                      //                      swizzled buffer (bit 6 = swz),
                      //                  all:
                      //                      volatile op (bit 31, stripped at lowering))
  [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
1186
// Struct buffer load on a ptr addrspace(8) rsrc.
class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [AMDGPUBufferRsrcTy,          // rsrc(SGPR)
   llvm_i32_ty,                 // vindex(VGPR)
   llvm_i32_ty,                 // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,                 // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],                // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                //                                   bit 1 = slc,
                                //                                   bit 2 = dlc on gfx10/gfx11),
                                //                      swizzled buffer (bit 3 = swz),
                                //                  gfx12+:
                                //                      cachepolicy (bits [0-2] = th,
                                //                                   bits [3-4] = scope)
                                //                      swizzled buffer (bit 6 = swz),
                                //                  all:
                                //                      volatile op (bit 31, stripped at lowering))
  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad;
def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad;
1208
// Raw buffer store on a <4 x i32> rsrc: vdata first, then the same operand
// layout as AMDGPURawBufferLoad.
class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10/gfx11),
                      //                      swizzled buffer (bit 3 = swz),
                      //                  gfx12+:
                      //                      cachepolicy (bits [0-2] = th,
                      //                                   bits [3-4] = scope)
                      //                      swizzled buffer (bit 6 = swz),
                      //                  all:
                      //                      volatile op (bit 31, stripped at lowering))
  [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;
1229
// Raw buffer store on a ptr addrspace(8) rsrc; the pointer (arg 1) carries
// argmemonly/writeonly/nocapture attributes.
class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,                     // vdata(VGPR)
   AMDGPUBufferRsrcTy,          // rsrc(SGPR)
   llvm_i32_ty,                 // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,                 // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],                // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                //                                   bit 1 = slc,
                                //                                   bit 2 = dlc on gfx10/gfx11),
                                //                      swizzled buffer (bit 3 = swz),
                                //                  gfx12+:
                                //                      cachepolicy (bits [0-2] = th,
                                //                                   bits [3-4] = scope)
                                //                      swizzled buffer (bit 6 = swz),
                                //                  all:
                                //                      volatile op (bit 31, stripped at lowering))
  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
  ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_ptr_buffer_store_format : AMDGPURawPtrBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_store : AMDGPURawPtrBufferStore;
1251
// Struct buffer store on a <4 x i32> rsrc: always has a vindex.
class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10/gfx11),
                      //                      swizzled buffer (bit 3 = swz),
                      //                  gfx12+:
                      //                      cachepolicy (bits [0-2] = th,
                      //                                   bits [3-4] = scope)
                      //                      swizzled buffer (bit 6 = swz),
                      //                  all:
                      //                      volatile op (bit 31, stripped at lowering))
  [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
1273
// Struct buffer store on a ptr addrspace(8) rsrc.
class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,                     // vdata(VGPR)
   AMDGPUBufferRsrcTy,          // rsrc(SGPR)
   llvm_i32_ty,                 // vindex(VGPR)
   llvm_i32_ty,                 // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,                 // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],                // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                //                                   bit 1 = slc,
                                //                                   bit 2 = dlc on gfx10/gfx11),
                                //                      swizzled buffer (bit 3 = swz),
                                //                  gfx12+:
                                //                      cachepolicy (bits [0-2] = th,
                                //                                   bits [3-4] = scope)
                                //                      swizzled buffer (bit 6 = swz),
                                //                  all:
                                //                      volatile op (bit 31, stripped at lowering))
  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_ptr_buffer_store_format : AMDGPUStructPtrBufferStore;
def int_amdgcn_struct_ptr_buffer_store : AMDGPUStructPtrBufferStore;
1296
// Raw buffer atomic RMW on a <4 x i32> rsrc. Plain Intrinsic (not
// DefaultAttrs), with properties listed explicitly.
class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
// Compare-and-swap takes separate src and cmp operands ahead of the rsrc.
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
1331
// Pointer-resource variants of the raw buffer atomics: the resource is an
// AMDGPUBufferRsrcTy operand (arg 1, not captured) and the intrinsics are
// marked IntrArgMemOnly. cachepolicy (arg 4) must be an immediate.
class AMDGPURawPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>,            // vdata(VGPR)
   AMDGPUBufferRsrcTy,          // rsrc(SGPR)
   llvm_i32_ty,                 // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,                 // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],                // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

def int_amdgcn_raw_ptr_buffer_atomic_swap : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_add : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_sub : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_smin : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_umin : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_fmin : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_smax : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_umax : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_fmax : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_and : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
// Compare-and-swap variant (separate src/cmp operands; cachepolicy is arg 5).
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
1369
// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
// gfx12+ intrinsic
// v2bf16 atomic fadd; operand layout matches AMDGPURawBufferAtomic above
// (same five operands, same ImmArg index).
def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic <
  [llvm_v2bf16_ty],
  [llvm_v2bf16_ty,   // vdata(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // offset(VGPR/imm)
   llvm_i32_ty,      // soffset(SGPR/imm)
   llvm_i32_ty],     // cachepolicy(imm)
 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
 AMDGPURsrcIntrinsic<1, 0>;
// Pointer-resource variant of the above (layout of AMDGPURawPtrBufferAtomic).
def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
  [llvm_v2bf16_ty],
  [llvm_v2bf16_ty,      // vdata(VGPR)
   AMDGPUBufferRsrcTy,  // rsrc(SGPR)
   llvm_i32_ty,         // offset(VGPR/imm)
   llvm_i32_ty,         // soffset(SGPR/imm)
   llvm_i32_ty],        // cachepolicy(imm)
 [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
  ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
 AMDGPURsrcIntrinsic<1, 0>;
1393
// Structured (vindex-addressed) buffer atomic read-modify-write on a v4i32
// resource descriptor; returns a value of the same type as vdata. The
// cachepolicy operand (arg 5) must be an immediate.
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
// Compare-and-swap variant (separate src/cmp operands; cachepolicy is arg 6).
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
1428
// Pointer-resource variants of the structured buffer atomics: the resource is
// an AMDGPUBufferRsrcTy operand (arg 1, not captured) and the intrinsics are
// marked IntrArgMemOnly. cachepolicy (arg 5) must be an immediate.
class AMDGPUStructPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>,            // vdata(VGPR)
   AMDGPUBufferRsrcTy,          // rsrc(SGPR)
   llvm_i32_ty,                 // vindex(VGPR)
   llvm_i32_ty,                 // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,                 // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],                // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_ptr_buffer_atomic_swap : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_add : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_sub : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_smin : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_umin : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_smax : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_umax : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_and : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic;
// Compare-and-swap variant (separate src/cmp operands; cachepolicy is arg 6).
def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
   ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
1465
// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
// gfx12 intrinsic
// Structured v2bf16 atomic fadd; operand layout matches AMDGPUStructBufferAtomic
// above (same six operands, same ImmArg index).
def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic <
  [llvm_v2bf16_ty],
  [llvm_v2bf16_ty,   // vdata(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // vindex(VGPR)
   llvm_i32_ty,      // offset(VGPR/imm)
   llvm_i32_ty,      // soffset(SGPR/imm)
   llvm_i32_ty],     // cachepolicy(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
// Pointer-resource variant of the above (layout of AMDGPUStructPtrBufferAtomic).
def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
  [llvm_v2bf16_ty],
  [llvm_v2bf16_ty,      // vdata(VGPR)
   AMDGPUBufferRsrcTy,  // rsrc(SGPR)
   llvm_i32_ty,         // vindex(VGPR)
   llvm_i32_ty,         // offset(VGPR/imm)
   llvm_i32_ty,         // soffset(SGPR/imm)
   llvm_i32_ty],        // cachepolicy(imm)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
1498
// Obsolescent tbuffer intrinsics.
// These keep separate dfmt/nfmt/glc/slc operands; superseded by the raw/struct
// tbuffer intrinsics below, which use joint format and cachepolicy operands.
def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // voffset(VGPR)
     llvm_i32_ty,     // soffset(SGPR)
     llvm_i32_ty,     // offset(imm)
     llvm_i32_ty,     // dfmt(imm)
     llvm_i32_ty,     // nfmt(imm)
     llvm_i1_ty,     // glc(imm)
     llvm_i1_ty],    // slc(imm)
    [IntrReadMem,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
     ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // voffset(VGPR)
     llvm_i32_ty,    // soffset(SGPR)
     llvm_i32_ty,    // offset(imm)
     llvm_i32_ty,    // dfmt(imm)
     llvm_i32_ty,    // nfmt(imm)
     llvm_i1_ty,     // glc(imm)
     llvm_i1_ty],    // slc(imm)
    [IntrWriteMem, ImmArg<ArgIndex<5>>,
     ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
     ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
1532
// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
// Raw (no vindex) tbuffer load; format (arg 3) and aux data (arg 4) must be
// immediates.
def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10/gfx11),
                      //                      swizzled buffer (bit 3 = swz),
                      //                  gfx12+:
                      //                      cachepolicy (bits [0-2] = th,
                      //                                   bits [3-4] = scope)
                      //                      swizzled buffer (bit 6 = swz))
    [IntrReadMem,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
1554
// Pointer-resource variant of the raw tbuffer load; the resource (arg 0) is
// read-only and not captured.
def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],      // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [AMDGPUBufferRsrcTy, // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10/gfx11),
                      //                      swizzled buffer (bit 3 = swz),
                      //                  gfx12+:
                      //                      cachepolicy (bits [0-2] = th,
                      //                                   bits [3-4] = scope)
                      //                      swizzled buffer (bit 6 = swz)
                      //                      volatile op (bit 31, stripped at lowering))
    [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
1573
// Raw (no vindex) tbuffer store; format (arg 4) and aux data (arg 5) must be
// immediates.
def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                     //                                       bit 1 = slc,
                     //                                       bit 2 = dlc on gfx10/gfx11),
                     //                      swizzled buffer (bit 3 = swz),
                     //                  gfx12+:
                     //                      cachepolicy (bits [0-2] = th,
                     //                                   bits [3-4] = scope)
                     //                      swizzled buffer (bit 6 = swz),
                     //                  all:
                     //                      volatile op (bit 31, stripped at lowering))
    [IntrWriteMem,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
1594
// Pointer-resource variant of the raw tbuffer store; the resource (arg 1) is
// write-only and not captured.
def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     AMDGPUBufferRsrcTy, // rsrc(SGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                     //                                       bit 1 = slc,
                     //                                       bit 2 = dlc on gfx10/gfx11),
                     //                      swizzled buffer (bit 3 = swz),
                     //                  gfx12+:
                     //                      cachepolicy (bits [0-2] = th,
                     //                                   bits [3-4] = scope)
                     //                      swizzled buffer (bit 6 = swz),
                     //                  all:
                     //                      volatile op (bit 31, stripped at lowering))
    [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
1615
// Structured (vindex-addressed) tbuffer load; format (arg 4) and aux data
// (arg 5) must be immediates.
def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10/gfx11),
                      //                      swizzled buffer (bit 3 = swz),
                      //                  gfx12+:
                      //                      cachepolicy (bits [0-2] = th,
                      //                                   bits [3-4] = scope)
                      //                      swizzled buffer (bit 6 = swz),
                      //                  all:
                      //                      volatile op (bit 31, stripped at lowering))
    [IntrReadMem,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
1636
// Pointer-resource variant of the structured tbuffer load; the resource
// (arg 0) is read-only and not captured.
def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [AMDGPUBufferRsrcTy, // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10/gfx11),
                      //                      swizzled buffer (bit 3 = swz),
                      //                  gfx12+:
                      //                      cachepolicy (bits [0-2] = th,
                      //                                   bits [3-4] = scope)
                      //                      swizzled buffer (bit 6 = swz),
                      //                  all:
                      //                      volatile op (bit 31, stripped at lowering))
    [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
1657
// Pointer-resource variant of the structured tbuffer store; the resource
// (arg 1) is write-only and not captured.
def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     AMDGPUBufferRsrcTy, // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                     //                                       bit 1 = slc,
                     //                                       bit 2 = dlc on gfx10/gfx11),
                     //                      swizzled buffer (bit 3 = swz),
                     //                  gfx12+:
                     //                      cachepolicy (bits [0-2] = th,
                     //                                   bits [3-4] = scope)
                     //                      swizzled buffer (bit 6 = swz),
                     //                  all:
                     //                      volatile op (bit 31, stripped at lowering))
    [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
1679
// Structured (vindex-addressed) tbuffer store; format (arg 5) and aux data
// (arg 6) must be immediates.
def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                     //                                       bit 1 = slc,
                     //                                       bit 2 = dlc on gfx10/gfx11),
                     //                      swizzled buffer (bit 3 = swz),
                     //                  gfx12+:
                     //                      cachepolicy (bits [0-2] = th,
                     //                                   bits [3-4] = scope)
                     //                      swizzled buffer (bit 6 = swz),
                     //                  all:
                     //                      volatile op (bit 31, stripped at lowering))
    [IntrWriteMem,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
1701
// Legacy buffer atomic form: vindex plus a single combined offset operand and
// an i1 slc flag. The raw/struct forms above should be preferred (see the
// note on int_amdgcn_buffer_atomic_fadd below).
class AMDGPUBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,       // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
// Compare-and-swap variant (separate src/cmp operands; slc is arg 5).
def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  [llvm_i32_ty],
  [llvm_i32_ty,       // src(VGPR)
   llvm_i32_ty,       // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;
1733
// Legacy floating-point buffer atomic (same operand layout as
// AMDGPUBufferAtomic, but overloaded on a float type).
class AMDGPUBufferAtomicFP : Intrinsic <
  [llvm_anyfloat_ty],
  [LLVMMatchType<0>, // vdata(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // vindex(VGPR)
   llvm_i32_ty,      // offset(SGPR/VGPR/imm)
   llvm_i1_ty],      // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;
1746
// Raw buffer load that writes directly into LDS: arg 1 is the LDS destination
// base pointer (address space 3, not captured), arg 2 the per-element byte
// size (1/2/4; immediate).
class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<3>,             // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                                       //                                       bit 1 = slc,
                                       //                                       bit 2 = dlc on gfx10/gfx11),
                                       //                      swizzled buffer (bit 3 = swz),
                                       //                  gfx12+:
                                       //                      cachepolicy (bits [0-2] = th,
                                       //                                   bits [3-4] = scope)
                                       //                      swizzled buffer (bit 6 = swz),
                                       //                  all:
                                       //                      volatile op (bit 31, stripped at lowering))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
1768
// Pointer-resource variant of the raw buffer-to-LDS load: the resource
// (arg 0) is read-only, the LDS pointer (arg 1) write-only; neither is
// captured.
class AMDGPURawPtrBufferLoadLDS : Intrinsic <
  [],
  [AMDGPUBufferRsrcTy,                 // rsrc(SGPR)
   LLVMQualPointerType<3>,             // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                                       //                                       bit 1 = slc,
                                       //                                       bit 2 = dlc on gfx10/gfx11),
                                       //                      swizzled buffer (bit 3 = swz),
                                       //                  gfx12+:
                                       //                      cachepolicy (bits [0-2] = th,
                                       //                                   bits [3-4] = scope)
                                       //                      swizzled buffer (bit 6 = swz),
                                       //                  all:
                                       //                      volatile op (bit 31, stripped at lowering))
  [IntrWillReturn, IntrArgMemOnly,
   ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS;
1793
// Structured (vindex-addressed) buffer load that writes directly into LDS;
// arg 1 is the LDS destination base pointer (address space 3, not captured),
// arg 2 the per-element byte size (1/2/4; immediate).
class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<3>,             // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // vindex(VGPR)
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                                       //                                       bit 1 = slc,
                                       //                                       bit 2 = dlc on gfx10/gfx11),
                                       //                      swizzled buffer (bit 3 = swz),
                                       //                  gfx12+:
                                       //                      cachepolicy (bits [0-2] = th,
                                       //                                   bits [3-4] = scope)
                                       //                      swizzled buffer (bit 6 = swz),
                                       //                  all:
                                       //                      volatile op (bit 31, stripped at lowering))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;
1816
// Pointer-resource variant of the structured buffer-to-LDS load: the resource
// (arg 0) is read-only, the LDS pointer (arg 1) write-only; neither is
// captured.
class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
  [],
  [AMDGPUBufferRsrcTy,                 // rsrc(SGPR)
   LLVMQualPointerType<3> ,            // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // vindex(VGPR)
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                                       //                                       bit 1 = slc,
                                       //                                       bit 2 = dlc on gfx10/gfx11),
                                       //                      swizzled buffer (bit 3 = swz),
                                       //                  gfx12+:
                                       //                      cachepolicy (bits [0-2] = th,
                                       //                                   bits [3-4] = scope)
                                       //                      swizzled buffer (bit 6 = swz),
                                       //                  all:
                                       //                      volatile op (bit 31, stripped at lowering))
  [IntrWillReturn, IntrArgMemOnly,
   ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS;
1842
1843} // defset AMDGPUBufferIntrinsics
1844
// llvm.amdgcn.exp - export to a render target / position / parameter target.
// Uses that do not set the done bit should set IntrWriteMem on the
// call site.
def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i1_ty         // vm (ignored on GFX11+)
  ],
  // tgt, en, done and vm must all be immediates.
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly]
>;

// exp with row_en bit set. Only supported on GFX11+.
def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i32_ty],      // row number (not required to be an immediate)
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   IntrWriteMem, IntrInaccessibleMemOnly]
>;

// exp with compr bit set (two packed 16-bit sources). Not supported on GFX11+.
def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_anyvector_ty, // src0 (v2f16 or v2i16)
  LLVMMatchType<0>,  // src1
  llvm_i1_ty,        // done
  llvm_i1_ty],       // vm
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly]
>;
1886
// Write back and invalidate the L1 cache, system-coherent variant (SI only).
def int_amdgcn_buffer_wbinvl1_sc :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

// Write back and invalidate the L1 cache.
def int_amdgcn_buffer_wbinvl1 :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

// Invalidate the scalar data cache.
def int_amdgcn_s_dcache_inv :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

// Read the 64-bit memory clock counter (s_memtime).
def int_amdgcn_s_memtime :
  ClangBuiltin<"__builtin_amdgcn_s_memtime">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>;

// Sleep for approximately (arg * 64) clocks; the duration is an immediate.
def int_amdgcn_s_sleep :
  ClangBuiltin<"__builtin_amdgcn_s_sleep">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]> {
}

// Variable-duration sleep: unlike s_sleep the argument may be a register.
def int_amdgcn_s_sleep_var
    : ClangBuiltin<"__builtin_amdgcn_s_sleep_var">,
      Intrinsic<[], [llvm_i32_ty],
                [IntrNoMem, IntrHasSideEffects, IntrWillReturn]> {
}

// Insert s_nop with an immediate wait-state count.
def int_amdgcn_s_nop :
  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]> {
}

// Increment the performance counter level (immediate selects the counter).
def int_amdgcn_s_incperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]> {
}

// Decrement the performance counter level (immediate selects the counter).
def int_amdgcn_s_decperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_decperflevel">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]> {
}

// s_sethalt with an immediate halt control value.
def int_amdgcn_s_sethalt :
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]>;

// Set wave execution priority (immediate).
def int_amdgcn_s_setprio :
  ClangBuiltin<"__builtin_amdgcn_s_setprio">,
  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]>;

// Emit thread-trace data from a register source.
def int_amdgcn_s_ttracedata :
  DefaultAttrsIntrinsic<[], [llvm_i32_ty],
                        [IntrNoMem, IntrHasSideEffects]>;
// Emit thread-trace data from an immediate.
def int_amdgcn_s_ttracedata_imm :
  DefaultAttrsIntrinsic<[], [llvm_i16_ty],
                        [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
1947
// Read a hardware register selected by the immediate operand.
// This is IntrHasSideEffects so it can be used to read cycle counters.
def int_amdgcn_s_getreg :
  ClangBuiltin<"__builtin_amdgcn_s_getreg">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
>;

// Note this can be used to set FP environment properties that are
// unsafe to change in non-strictfp functions. The register properties
// available (and value required to access them) may differ per
// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
def int_amdgcn_s_setreg :
  ClangBuiltin<"__builtin_amdgcn_s_setreg">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
>;

// int_amdgcn_s_getpc is provided to allow a specific style of position
// independent code to determine the high part of its address when it is
// known (through convention) that the code and any data of interest does
// not cross a 4Gb address boundary. Use for any other purpose may not
// produce the desired results as optimizations may cause code movement,
// especially as we explicitly use IntrNoMem to allow optimizations.
// This intrinsic always returns PC sign-extended from 48 bits even if the
// s_getpc_b64 instruction returns a zero-extended value.
def int_amdgcn_s_getpc :
  ClangBuiltin<"__builtin_amdgcn_s_getpc">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
                                IntrWillReturn]>;
1977
// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
// param values: 0 = P10, 1 = P20, 2 = P0
def int_amdgcn_interp_mov :
  ClangBuiltin<"__builtin_amdgcn_interp_mov">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
              ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
// First step of two-step barycentric interpolation.
// This intrinsic reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_interp_p1 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
// Second step, consuming the result of interp_p1.
def int_amdgcn_interp_p2 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
          // See int_amdgcn_v_interp_p1 for why this is IntrNoMem.

// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p1_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p2_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">,
  DefaultAttrsIntrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.lds.direct.load <m0>
// The input argument is m0, which contains a packed combination of address
// offset and flags describing the data type.
def int_amdgcn_lds_direct_load :
  DefaultAttrsIntrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16
            [llvm_i32_ty],
            [IntrReadMem, IntrSpeculatable]>;

// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0>
// Like interp intrinsics, this reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_lds_param_load :
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0>
// VGPR-operand (GFX11+) interpolation, first step.
def int_amdgcn_interp_inreg_p10 :
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
// VGPR-operand interpolation, second step.
def int_amdgcn_interp_inreg_p2 :
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
// high selects whether high or low 16-bits are used for p and p0 operands
def int_amdgcn_interp_inreg_p10_f16:
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high>
// high selects whether high or low 16-bits are used for p operand
def int_amdgcn_interp_inreg_p2_f16 :
  DefaultAttrsIntrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<3>>]>;
2068
// Returns true for lanes that are not pixel-shader helper lanes.
// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : DefaultAttrsIntrinsic <
  [llvm_i1_ty],
  [],
  [IntrNoMem]>;

// Query currently live lanes.
// Returns true if lane is live (and not a helper lane).
def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty],
  [], [IntrReadMem, IntrInaccessibleMemOnly]
>;

// Masked bit count of the low 32 lanes: popcount(arg0 & lane_mask_below) + arg1.
def int_amdgcn_mbcnt_lo :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem]>;

// Masked bit count of the high 32 lanes; chained with mbcnt_lo to compute
// a lane index within a wave64.
def int_amdgcn_mbcnt_hi :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem]>;

// llvm.amdgcn.ds.swizzle src offset
// Cross-lane data swizzle; the swizzle pattern must be an immediate.
def int_amdgcn_ds_swizzle :
  ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
             ImmArg<ArgIndex<1>>]>;

// Unsigned bitfield extract: (src >> offset) & ((1 << width) - 1).
def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// Signed bitfield extract (sign-extends the extracted field).
def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;
2107
// Per-byte linear interpolation (v_lerp_u8).
def int_amdgcn_lerp :
  ClangBuiltin<"__builtin_amdgcn_lerp">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Sum of absolute differences over packed bytes (v_sad_u8).
def int_amdgcn_sad_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Masked sum of absolute differences over packed bytes (v_msad_u8).
def int_amdgcn_msad_u8 :
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Byte SAD with the accumulator added into the high result (v_sad_hi_u8).
def int_amdgcn_sad_hi_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Sum of absolute differences over packed 16-bit values (v_sad_u16).
def int_amdgcn_sad_u16 :
  ClangBuiltin<"__builtin_amdgcn_sad_u16">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Quad SAD with packed 16-bit accumulators (v_qsad_pk_u16_u8).
def int_amdgcn_qsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Masked quad SAD with packed 16-bit accumulators (v_mqsad_pk_u16_u8).
def int_amdgcn_mqsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Masked quad SAD with four 32-bit accumulators (v_mqsad_u32_u8).
def int_amdgcn_mqsad_u32_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
  DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Convert a float to an 8-bit value and insert it into the selected byte
// of the 32-bit accumulator (v_cvt_pk_u8_f32).
def int_amdgcn_cvt_pk_u8_f32 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
2161
// Wave-wide integer compare: returns a bitmask of lanes for which the
// comparison (selected by the immediate condition code) is true.
def int_amdgcn_icmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent,
             ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Wave-wide floating-point compare; same bitmask semantics as icmp.
def int_amdgcn_fcmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent,
             ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Returns a bitmask of the active lanes for which the i1 argument is true.
def int_amdgcn_ballot :
  Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Inverse of ballot: each lane reads its own bit from the mask argument.
def int_amdgcn_inverse_ballot :
  Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
            [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Lowers to S_BITREPLICATE_B64_B32.
// The argument must be uniform; otherwise, the result is undefined.
def int_amdgcn_s_bitreplicate :
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

// Lowers to S_QUADMASK_B{32,64}
// The argument must be uniform; otherwise, the result is undefined.
def int_amdgcn_s_quadmask :
  DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;

// Lowers to S_WQM_B{32,64}
// The argument must be uniform; otherwise, the result is undefined.
// Does not set WQM; merely calculates the bitmask.
def int_amdgcn_s_wqm :
  DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;

// Common shape for wave-wide reductions (umin/umax below).
class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
    [data_ty],
    [
      LLVMMatchType<0>,   // llvm value to reduce (SGPR/VGPR)
      llvm_i32_ty         // Reduction Strategy Switch for lowering ( 0: Default,
                          //                                          1: Iterative strategy, and
                          //                                          2: DPP)
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
2208
// Returns the value of the argument in the first active lane of the wave.
def int_amdgcn_readfirstlane :
  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
  ClangBuiltin<"__builtin_amdgcn_readlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
  ClangBuiltin<"__builtin_amdgcn_writelane">,
  Intrinsic<[llvm_i32_ty], [
    llvm_i32_ty,    // uniform value to write: returned by the selected lane
    llvm_i32_ty,    // uniform lane select
    llvm_i32_ty     // returned by all lanes other than the selected one
  ],
  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
2233
// Byte-aligned extraction from a 64-bit concatenation of the two sources
// (v_alignbyte_b32).
def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// mul24 intrinsics can return i32 or i64.
// When returning i64, they're lowered to a mul24/mulhi24 pair.
def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// High 32 bits of the 48-bit signed 24x24-bit multiply.
def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// High 32 bits of the 48-bit unsigned 24x24-bit multiply.
def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
2260
// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_init :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_init">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrWriteMem,
   IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id)
// vsrc0 carries the barrier value (same encoding as ds.gws.init's bar_val):
// the total number of waves that will wait on this barrier, minus 1.
def int_amdgcn_ds_gws_barrier :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.v(i32 resource_id)
def int_amdgcn_ds_gws_sema_v :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id)
def int_amdgcn_ds_gws_sema_br :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.p(i32 resource_id)
def int_amdgcn_ds_gws_sema_p :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id)
def int_amdgcn_ds_gws_sema_release_all :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;
2320
2321
// Copies the source value to the destination value, with the guarantee that
// the source value is computed as if the entire program were executed in WQM.
def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Copies the source value to the destination value, such that the source
// is computed as if the entire program were executed in WQM if any other
// program code executes in WQM.
def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Return true if at least one thread within the pixel quad passes true into
// the function.
def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
  [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// If false, set EXEC=0 for the current thread until the end of program.
// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn?
def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>;

// Terminate the program (s_endpgm); never returns.
def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">,
  Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                     IntrNoCallback, IntrNoFree]
>;

// If false, mark all active lanes as helper lanes until the end of program.
def int_amdgcn_wqm_demote : Intrinsic<[],
  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree]
>;
2354
// Copies the active channels of the source value to the destination value,
// with the guarantee that the source value is computed as if the entire
// program were executed in Whole Wavefront Mode, i.e. with all channels
// enabled, with a few exceptions: - Phi nodes which require WWM return an
// undefined value.
def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
// Deprecated. Use int_amdgcn_strict_wwm instead.
def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
// Strict WQM copy, analogous to strict_wwm above.
def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Given a value, copies it while setting all the inactive lanes to a given
// value. Note that OpenGL helper lanes are considered active, so if the
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, // value to be copied
             LLVMMatchType<0>], // value for the inactive lanes to take
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Similar to int_amdgcn_set_inactive, but the value for the inactive lanes must
// be a VGPR function argument.
// Can only be used in functions with the `amdgpu_cs_chain` or
// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control
// flow.
def int_amdgcn_set_inactive_chain_arg :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, // value to be copied
             LLVMMatchType<0>], // value for the inactive lanes to take
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2394
// Return if the given flat pointer points to a local memory address.
def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">,
  DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]
>;

// Return if the given flat pointer points to a private memory address.
def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">,
  DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]
>;

// A uniform tail call to a function with the `amdgpu_cs_chain` or
// `amdgpu_cs_chain_preserve` calling convention. It will populate the SGPRs
// starting at s0 and the VGPRs starting at v8, set EXEC and perform a jump to
// the given function.
// Can only be used in functions with the `amdgpu_cs`, `amdgpu_cs_chain` or
// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control
// flow.
def int_amdgcn_cs_chain:
  Intrinsic<[],
            [llvm_anyptr_ty, // The function to jump to.
             llvm_anyint_ty, // Value to put in EXEC (should be i32 or i64).
             llvm_any_ty, // Arguments that will be copied into SGPRs (s0+).
                          // Must be uniform.
             llvm_any_ty, // Arguments that will be copied into VGPRs (v8+).
                          // Need not be uniform.
             llvm_i32_ty, // Flags.
             llvm_vararg_ty // Additional arguments. Only present if Flags is
                            // non-zero.
            ],
            [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;
2427
2428
//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//

// Volatile invalidate of the scalar data cache.
def int_amdgcn_s_dcache_inv_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

// Volatile write back and invalidate of the L1 cache.
def int_amdgcn_buffer_wbinvl1_vol :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

//===----------------------------------------------------------------------===//
// VI Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// All DPP control operands must be immediates.
def int_amdgcn_mov_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
             llvm_i1_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// Should be equivalent to:
// v_mov_b32 <dest> <old>
// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_update_dpp :
  Intrinsic<[llvm_any_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
            llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn,
              ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
              ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
2465
// Write back dirty lines of the scalar data cache.
def int_amdgcn_s_dcache_wb :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Write back dirty lines of the scalar data cache, volatile variant.
def int_amdgcn_s_dcache_wb_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Read the 64-bit real-time counter (s_memrealtime).
def int_amdgcn_s_memrealtime :
  ClangBuiltin<"__builtin_amdgcn_s_memrealtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.ds.permute <index> <src>
// Forward (push) lane permute: each lane writes src to the lane named by index.
def int_amdgcn_ds_permute :
  ClangBuiltin<"__builtin_amdgcn_ds_permute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.ds.bpermute <index> <src>
// Backward (pull) lane permute: each lane reads src from the lane named by index.
def int_amdgcn_ds_bpermute :
  ClangBuiltin<"__builtin_amdgcn_ds_bpermute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.perm <src0> <src1> <selector>
// Byte permute of the 64-bit concatenation of the two sources (v_perm_b32).
def int_amdgcn_perm :
  ClangBuiltin<"__builtin_amdgcn_perm">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
     [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2495
//===----------------------------------------------------------------------===//
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//

// Load from a global address directly into LDS, bypassing VGPRs.
class AMDGPUGlobalLoadLDS : Intrinsic <
  [],
  [LLVMQualPointerType<1>,             // Base global pointer to load from
   LLVMQualPointerType<3>,             // LDS base pointer to store to
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // imm offset (applied to both global and LDS address)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
                                       //                                   bit 1 = slc/sc1,
                                       //                                   bit 2 = dlc on gfx10/gfx11,
                                       //                                   bit 4 = scc/nt on gfx90a+),
                                       //                  gfx12+:
                                       //                      cachepolicy (bits [0-2] = th,
                                       //                                   bits [3-4] = scope)
                                       //                      swizzled buffer (bit 6 = swz))
  // Size, offset and aux policy must be immediates; both pointers are memory
  // operands that must not be captured.
  [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
  "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
2518
2519//===----------------------------------------------------------------------===//
2520// GFX10 Intrinsics
2521//===----------------------------------------------------------------------===//
2522
// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
// Lane permute within groups of 16 lanes. <fi> and <bound_control> must be
// immediates (ImmArg on args 4 and 5); convergent since the result depends
// on other lanes' values.
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
// Cross-group variant of permlane16; same operand contract as above.
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
2536
// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
// <sel> is a 32-bit constant whose high 8 bits must be zero which selects
// the lanes to read from.
// (The low 24 bits encode eight 3-bit per-lane selectors within each group
// of 8 lanes.) <sel> must be an immediate (ImmArg on arg 1).
def int_amdgcn_mov_dpp8 :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>;

// Returns this wave's id within its workgroup. IntrHasSideEffects is set
// alongside IntrNoMem to keep calls from being CSE'd/hoisted.
def int_amdgcn_s_get_waveid_in_workgroup :
  ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
  Intrinsic<[llvm_i32_ty], [],
    [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2550
// Common shape for value-returning atomic RMW intrinsics: one pointer
// operand (any address space by default) plus one data operand of the
// result type. Memory effects attach via SDNPMemOperand; only the pointed-to
// memory is touched (IntrArgMemOnly).
class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic <
  [vt],
  [pt,  // vaddr
   vt], // vdata(VGPR)
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]>;

// Atomic "csub" (subtract-with-clamp) on global memory; returns an i32
// result (NOTE(review): presumably the pre-op value — confirm against ISA).
def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>;
2559
// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
//                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
// Ray/BVH-node intersection test. Only reads memory (IntrReadMem) — the BVH
// node is accessed through the v4i32 texture descriptor.
def int_amdgcn_image_bvh_intersect_ray :
  DefaultAttrsIntrinsic<[llvm_v4i32_ty],
            [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
             LLVMMatchType<1>, llvm_v4i32_ty],
            [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
2569
2570//===----------------------------------------------------------------------===//
2571// GFX11 Intrinsics
2572//===----------------------------------------------------------------------===//
2573
// llvm.amdgcn.permlane64 <src0>
// Swaps values between the two 32-lane halves of a wave64 (v_permlane64_b32).
def int_amdgcn_permlane64 :
  ClangBuiltin<"__builtin_amdgcn_permlane64">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Atomically adds to a GS register and returns a result. Arg 0 is the data
// value; arg 1 is an immediate register offset (ImmArg). Memory effects are
// modeled via SDNPMemOperand.
def int_amdgcn_ds_add_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
            "", [SDNPMemOperand]>;

// Subtract counterpart of ds_add_gs_reg_rtn; identical operand contract.
def int_amdgcn_ds_sub_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
            "", [SDNPMemOperand]>;

// BVH traversal-stack push/pop in LDS; returns the new value and the
// updated stack address. %offset must be an immediate (ImmArg on arg 3).
def int_amdgcn_ds_bvh_stack_rtn :
  Intrinsic<
    [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
    [
      llvm_i32_ty,   // %addr
      llvm_i32_ty,   // %data0
      llvm_v4i32_ty, // %data1
      llvm_i32_ty,   // %offset
    ],
    [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
  >;

// Stalls the wave until export space is available (s_wait_event).
// IntrHasSideEffects keeps it ordered despite IntrNoMem.
def int_amdgcn_s_wait_event_export_ready :
  ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
>;
2608
2609// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2610//
2611// These operations perform a matrix multiplication and accumulation of
2612// the form: D = A * B + C .
2613
// Basic WMMA: D = A * B + C, where A/B share one overloaded type and C/D
// share another (C is tied to the result type via LLVMMatchType<0>).
class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD], // %D
    [
      AB,               // %A
      LLVMMatchType<1>, // %B
      LLVMMatchType<0>, // %C
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// WMMA variant with a trailing immediate op_sel bit choosing which 16-bit
// half of the C/D registers is used (GFX11); must be 0 on GFX12.
class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD], // %D
    [
      AB,               // %A
      LLVMMatchType<1>, // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %high (op_sel) for GFX11, 0 for GFX12
    ],
    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Integer WMMA variant: immediate sign bits for A and B plus an immediate
// clamp flag (ImmArg on args 0, 2 and 5).
class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD], // %D
    [
      llvm_i1_ty,       // %A_sign
      AB,               // %A
      llvm_i1_ty,       // %B_sign
      LLVMMatchType<1>, // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %clamp
    ],
    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
2650
// WMMA GFX11Only

// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
// The content of the other 16-bit half is preserved from the input.
// (bf16 variants use integer types since LLVM has no packed-bf16 vector here.)
def int_amdgcn_wmma_f16_16x16x16_f16_tied   : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;

// WMMA GFX11Plus

def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;

// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
//        The content of the other 16-bit half is undefined.
// GFX12: The op_sel bit must be 0.
def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
2671
2672//===----------------------------------------------------------------------===//
2673// GFX12 Intrinsics
2674//===----------------------------------------------------------------------===//
2675
// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control>
// Like permlane16 but the lane selector <src1> is a VGPR rather than an
// immediate; <fi> and <bound_control> remain immediates (ImmArg on args 3-4).
def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.permlanex16.var <old> <src0> <src1> <fi> <bound_control>
// Cross-group variant; same operand contract as permlane16.var.
def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
2689
2690
2691// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2692//
2693// These operations perform a matrix multiplication and accumulation of
2694// the form: D = A * B + C .
2695
// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
// A and B are <16 x iu4>.
def int_amdgcn_wmma_i32_16x16x32_iu4     : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
2703
2704// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
2705//
2706// These operations perform a sparse matrix multiplication and accumulation of
2707// the form: D = A * B + C.
2708// A is sparse matrix, half the size of B, and is expanded using sparsity index.
2709
// Sparse WMMA: D = A * B + C, with a sparsity index expanding the
// half-size A matrix.
// NOTE(review): unlike the WMMA classes above, these two classes do not
// carry IntrNoCallback/IntrNoFree — confirm whether that is intentional.
class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
  Intrinsic<
    [CD],               // %D
    [
      A,                // %A
      B,                // %B
      LLVMMatchType<0>, // %C
      Index             // %Sparsity index for A
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

// Integer sparse WMMA: adds immediate sign bits for A/B and an immediate
// clamp flag (ImmArg on args 0, 2 and 6).
class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
  Intrinsic<
    [CD],               // %D
    [
      llvm_i1_ty,       // %A_sign
      A,                // %A
      llvm_i1_ty,       // %B_sign
      B,                // %B
      LLVMMatchType<0>, // %C
      Index,            // %Sparsity index for A
      llvm_i1_ty,       // %clamp
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
>;
2736
// bf16/fp8/bf8 element types are carried in integer vectors (no such LLVM
// scalar types); float types are used where a real LLVM float type exists.
def int_amdgcn_swmmac_f32_16x16x32_f16     : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf16    : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f16_16x16x32_f16     : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_bf16_16x16x32_bf16   : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_i32_16x16x32_iu8     : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_i32_16x16x32_iu4     : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_i32_16x16x64_iu4     : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
2748
// Ordered 64-bit atomic add on a global (addrspace 1) pointer.
def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;

// IEEE-number min/max atomics (the "_num" forms) for flat and global memory.
def int_amdgcn_flat_atomic_fmin_num   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax_num   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

// Conditional subtract (cond_sub_u32) returning an i32 result.
def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>;
2757
// Transposed load from memory; result type is overloaded (see the
// per-wave-size instantiations listed below). Read-only, convergent, and
// the pointer is not captured.
class AMDGPULoadTr<LLVMType ptr_ty>:
  Intrinsic<
    [llvm_any_ty],
    [ptr_ty],
    [IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree],
    "",
    [SDNPMemOperand]
  >;

// Wave32
// <2 x i32>    @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1))  -> global_load_tr_b64
// <8 x i16>    @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1))  -> global_load_tr_b128
// <8 x half>   @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1))  -> global_load_tr_b128
// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128
// Wave64
// i32          @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1))    -> global_load_tr_b64
// <4 x i16>    @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1))  -> global_load_tr_b128
// <4 x half>   @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1))  -> global_load_tr_b128
// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128

def int_amdgcn_global_load_tr : AMDGPULoadTr<global_ptr_ty>;
2779
// i32 @llvm.amdgcn.wave.id()
// Returns this wave's id; pure and speculatable like the other preload reads.
def int_amdgcn_wave_id :
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
2783
2784//===----------------------------------------------------------------------===//
2785// Deep learning intrinsics.
2786//===----------------------------------------------------------------------===//
2787
// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// %clamp must be an immediate (ImmArg on arg 3).
def int_amdgcn_fdot2 :
  ClangBuiltin<"__builtin_amdgcn_fdot2">,
  DefaultAttrsIntrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;
2802
// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
  DefaultAttrsIntrinsic<
    [llvm_half_ty],  // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_half_ty   // %c
    ],
    [IntrNoMem, IntrSpeculatable]
  >;

// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// bf16 values are carried as i16 (no bf16 scalar type is used here).
def int_amdgcn_fdot2_bf16_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
  DefaultAttrsIntrinsic<
    [llvm_i16_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i16_ty    // %c
    ],
    [IntrNoMem, IntrSpeculatable]
  >;
2830
// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// bf16 inputs are carried as v2i16; %clamp must be an immediate.
def int_amdgcn_fdot2_f32_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
  DefaultAttrsIntrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;
2845
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// Signed 2-element dot product; %clamp must be an immediate.
def int_amdgcn_sdot2 :
  ClangBuiltin<"__builtin_amdgcn_sdot2">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// Unsigned counterpart of sdot2; same operand contract.
def int_amdgcn_udot2 :
  ClangBuiltin<"__builtin_amdgcn_udot2">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;
2875
// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// Four signed i8 lanes packed into each i32 operand; %clamp is an immediate.
def int_amdgcn_sdot4 :
  ClangBuiltin<"__builtin_amdgcn_sdot4">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// Unsigned counterpart of sdot4; same operand contract.
def int_amdgcn_udot4 :
  ClangBuiltin<"__builtin_amdgcn_udot4">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;
2905
// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0...3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
// b[i in 0...3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// The sign selectors and %clamp must be immediates (ImmArg on args 0, 2, 5).
def int_amdgcn_sudot4 :
  ClangBuiltin<"__builtin_amdgcn_sudot4">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;
2926
// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
// Eight signed 4-bit lanes packed into each i32 operand; %clamp immediate.
def int_amdgcn_sdot8 :
  ClangBuiltin<"__builtin_amdgcn_sdot8">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
// Unsigned counterpart of sdot8; same operand contract.
def int_amdgcn_udot8 :
  ClangBuiltin<"__builtin_amdgcn_udot8">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;
2958
// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0...7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
// b[i in 0...7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
// The sign selectors and %clamp must be immediates (ImmArg on args 0, 2, 5).
def int_amdgcn_sudot8 :
  ClangBuiltin<"__builtin_amdgcn_sudot8">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;
2980
// f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// The builtin name is derived from the record name by replacing the "int"
// prefix with "__builtin" (!subst on NAME).
class AMDGPU8bitFloatDot4Intrinsic :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<
    [llvm_float_ty], // %r
    [
      llvm_i32_ty,   // %a
      llvm_i32_ty,   // %b
      llvm_float_ty, // %c
    ],
    [IntrNoMem, IntrSpeculatable]
  >;

// Four fp8/bf8 lanes packed into each i32 operand.
def int_amdgcn_dot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
def int_amdgcn_dot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
def int_amdgcn_dot4_f32_fp8_fp8 : AMDGPU8bitFloatDot4Intrinsic;
def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic;
2999
3000//===----------------------------------------------------------------------===//
3001// gfx908 intrinsics
3002// ===----------------------------------------------------------------------===//
3003
// Returning float-add atomic on global memory.
def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
// Matrix FMA: cbsz/abid/blgp are immediate control fields (ImmArg on
// args 3-5). The builtin name replaces the "int" prefix with "__builtin".
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<[DestTy],
            [SrcABTy, SrcABTy, DestTy,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// bf16 source operands are carried as v2i16 packs.
def int_amdgcn_mfma_f32_32x32x1f32  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x2f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x4f32  : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x4f16  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x4f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_4x4x4f16    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_32x32x8f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_i32_32x32x4i8   : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x4i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_4x4x4i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_i32_32x32x8i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x16i8  : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_4x4x2bf16   : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
3035
3036//===----------------------------------------------------------------------===//
3037// gfx90a intrinsics
3038// ===----------------------------------------------------------------------===//
3039
// Returning float min/max/add atomics for global and flat address spaces.
def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

// "_1k" bf16 MFMA variants taking v4i16 (wider bf16 packs than the base
// v2i16 forms above).
def int_amdgcn_mfma_f32_32x32x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;

// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA.
//       Three bits corresponding to the neg modifier applied to the respective
//       source operand.
def int_amdgcn_mfma_f64_16x16x4f64      : AMDGPUMfmaIntrinsic<llvm_v4f64_ty,  llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64        : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
3057
3058//===----------------------------------------------------------------------===//
3059// gfx940 intrinsics
3060// ===----------------------------------------------------------------------===//
3061
// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUAtomicRtn<llvm_v2i16_ty>;
// LDS (addrspace 3) bf16 fadd; only touches the pointed-to memory.
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
    [llvm_v2i16_ty],
    [LLVMQualPointerType<3>, llvm_v2i16_ty],
    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
    ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

// i8/xf32 MFMA variants: eight i8 lanes packed into i64, xf32 as v2f32.
def int_amdgcn_mfma_i32_16x16x32_i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8     : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

// fp8/bf8 MFMA: eight 8-bit float lanes packed into an i64 source.
class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
  AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

// Stamps out the four fp8/bf8 source-type combinations for one dest type.
multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}

defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
3086
// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
// Sparse MFMA: srcA is half-width and expanded via the index operand.
// cbsz/abid are immediates (ImmArg on args 4-5); index (arg 3) is not.
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<[DestTy],
            [SrcA, SrcB, DestTy, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// bf16 sources carried as i16 packs; i8 sources packed into i32 vectors.
def int_amdgcn_smfmac_f32_16x16x32_f16  : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16  : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8   : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty,  llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8   : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

// fp8/bf8 SMFMAC: 8-bit float lanes packed into i32 vectors.
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
  AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

// Stamps out the four fp8/bf8 source-type combinations for one dest type.
multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}

defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
3113
// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects byte from srcA.
def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
// fp8 counterpart of cvt.f32.bf8; same operand contract.
def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
            [llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel.
def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
            [llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
// The unselected half of vdst comes from the "old" operand.
def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects byte to write into vdst.
// "sr" = stochastic rounding; srcB supplies the random bits.
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, ImmArg<ArgIndex<3>>]>;
3165
3166//===----------------------------------------------------------------------===//
3167// Special Intrinsics for backend internal use only. No frontend
3168// should emit calls to these.
3169// ===----------------------------------------------------------------------===//
// Control-flow intrinsics inserted by the backend's structurizer /
// control-flow annotation passes; the anyint payload carries the exec mask.
def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, LLVMMatchType<0>],
  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
  [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;

// Emit 2.5 ulp, no denormal division. Should only be inserted by
// pass based on !fpmath metadata.
def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;
3199}
3200