//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the AMDGPU-specific intrinsics.
//
//===----------------------------------------------------------------------===//

class AMDGPUReadPreloadRegisterIntrinsic
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>, ClangBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
  int RsrcArg = rsrcarg;
  bit IsImage = isimage;
}

let TargetPrefix = "r600" in {

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}

defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
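
// Example (OpenCL C sketch using the builtins generated above): the X local
// size can be recovered from the preloaded dispatch geometry:
//   uint lsx = __builtin_r600_read_global_size_x() /
//              __builtin_r600_read_ngroups_x();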

def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;

// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
def int_r600_implicitarg_ptr :
  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrWillReturn]>,
  ClangBuiltin<"__builtin_r600_rat_store_typed">;

def int_r600_recipsqrt_ieee : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_recipsqrt_clamped : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_cube : Intrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_store_stream_output : Intrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn]
>;

class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y
  llvm_i32_ty,   // offset_z
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem, IntrWillReturn]
>;

class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
    llvm_v4i32_ty, // Coord
    llvm_i32_ty,   // offset_x
    llvm_i32_ty,   // offset_y
    llvm_i32_ty,   // offset_z
    llvm_i32_ty,   // resource_id
    llvm_i32_ty,   // samplerid
    llvm_i32_ty,   // coord_type_x
    llvm_i32_ty,   // coord_type_y
    llvm_i32_ty,   // coord_type_z
    llvm_i32_ty],  // coord_type_w
    [IntrNoMem, IntrWillReturn]
>;

def int_r600_store_swizzle :
  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn]
>;

def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;

def int_r600_dot4 : Intrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_kill : Intrinsic<[], [llvm_float_ty], [IntrWillReturn]>;

} // End TargetPrefix = "r600"

let TargetPrefix = "amdgcn" in {

//===----------------------------------------------------------------------===//
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;
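
// Example (OpenCL C sketch): computing a flat X index. The _x-suffixed
// builtin spelling follows the defm expansion above;
// __builtin_amdgcn_workitem_id_x is assumed to be provided by Clang even
// though no ClangBuiltin is attached to int_amdgcn_workitem_id here.
//   uint flat_x(uint local_size_x) {
//     return __builtin_amdgcn_workgroup_id_x() * local_size_x +
//            __builtin_amdgcn_workitem_id_x();
//   }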

def int_amdgcn_dispatch_ptr :
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_queue_ptr :
  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_kernarg_segment_ptr :
  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_implicitarg_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
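
// Example (OpenCL C sketch): implicit kernel arguments are read through the
// returned constant (addrspace 4) pointer. Field offsets are an ABI detail;
// OFFSET below is a hypothetical placeholder:
//   __constant uchar *impl =
//       (__constant uchar *)__builtin_amdgcn_implicitarg_ptr();
//   uint field = *(__constant uint *)(impl + OFFSET);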

def int_amdgcn_groupstaticsize :
  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_dispatch_id :
  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_implicit_buffer_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
// FIXME: Should be mangled for wave size.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, ImmArg<ArgIndex<0>>]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic; computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent, ImmArg<ArgIndex<1>>]>;
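
// Worked example: with input = 0x00001940 and bitoffset = 8,
//   thread_count = (0x00001940 >> 8) & 0x7f = 0x19 = 25 active lanes.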

def int_amdgcn_wavefrontsize :
  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;


//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is the s_sendmsg immediate (i16);
// the second is copied to m0.
def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
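
// Example (OpenCL C sketch using the builtin declared above; the message
// encoding is ISA-specific, so MSG_ID here is a hypothetical immediate):
//   __builtin_amdgcn_s_sendmsg(MSG_ID, payload); // payload is copied to m0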


// gfx11 intrinsic
// The first parameter is the s_sendmsg immediate (i16). The return type is i32 or i64.
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn]>;
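
// Example (OpenCL C sketch using the builtin declared above): per the table,
// a mask of 0 pins all surrounding instructions to their side of the barrier:
//   __builtin_amdgcn_sched_barrier(0);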

def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;
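
// Example (OpenCL C sketch): the immediate packs the hardware's
// vmcnt/expcnt/lgkmcnt fields; 0 waits for all outstanding counters:
//   __builtin_amdgcn_s_waitcnt(0);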

def int_amdgcn_div_scale : Intrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Constant selecting which operand the result is scaled
  //                for (0 = Denominator, 1 = Numerator).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>, IntrWillReturn]
>;

def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_div_fixup : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Look up 2.0/pi for src0, with segment select src1[4:0].
def int_amdgcn_trig_preop : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sin : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cos : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_log_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
>;

// Fused single-precision multiply-add with legacy behaviour for the multiply,
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely different kind of legacy behaviour.)
def int_amdgcn_fma_legacy :
  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
>;

def int_amdgcn_rcp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sqrt : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rsq : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  Intrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// out = 1.0 / sqrt(a), with the result clamped to +/- max_float.
def int_amdgcn_rsq_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_ldexp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_frexp_mant : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_frexp_exp : Intrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pknorm_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pknorm_u16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_i16 :
    ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
    Intrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_class : Intrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_fmed3 : ClangBuiltin<"__builtin_amdgcn_fmed3">,
  Intrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
  Intrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
  Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
  Intrinsic<[llvm_anyfloat_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Fields should mirror atomicrmw
class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
  [llvm_anyptr_ty,
  LLVMMatchType<0>,
  llvm_i32_ty, // ordering
  llvm_i32_ty, // scope
  llvm_i1_ty], // isVolatile
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "",
  [SDNPMemOperand]
>;

def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;

class AMDGPULDSIntrin :
  Intrinsic<[llvm_any_ty],
    [LLVMQualPointerType<LLVMMatchType<0>, 3>,
    LLVMMatchType<0>,
    llvm_i32_ty, // ordering
    llvm_i32_ty, // scope
    llvm_i1_ty], // isVolatile
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]
>;

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<llvm_i32_ty, 2>, // IntToPtr(M0)
   llvm_i32_ty, // value to add or swap
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty,  // isVolatile
   llvm_i32_ty, // ordered count index (OA index), also added to the address
                // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,  // wave release, usually set to 1
   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
  [IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>
  ]
>;

class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty], // isVolatile
   [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>],
   "",
   [SDNPMemOperand]
>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;
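
// Example (OpenCL C sketch; assumes Clang's __builtin_amdgcn_ds_faddf, whose
// trailing arguments mirror the ordering/scope/isVolatile operands above):
//   float old = __builtin_amdgcn_ds_faddf(lds_ptr, 1.0f,
//                                         /*ordering=*/0, /*scope=*/0,
//                                         /*isVolatile=*/false);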

} // TargetPrefix = "amdgcn"

// New-style image intrinsics

//////////////////////////////////////////////////////////////////////////
// Dimension-aware image intrinsics framework
//////////////////////////////////////////////////////////////////////////

// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}

// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}
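
// For example, makeArgList<["s", "t"], llvm_anyfloat_ty>.ret is
//   [AMDGPUArg<llvm_anyfloat_ty, "s">, AMDGPUArg<LLVMMatchType<0>, "t">],
// i.e. only the first argument introduces a new overloaded type; the rest
// must match it.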

// Return arglist, with LLVMMatchType's references shifted by 'shift'.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}

// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
                               !add(shift, !foldl(0, lhs, a, b,
                                                  !add(a, b.Type.isAny)))>.ret));
}

// Represent texture/image types / dimensionality.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names,
                     bit msaa = 0> {
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name; // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0; // DA bit in MIMG encoding
  bit MSAA = msaa;

  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}

def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
}

def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}

// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod;
  string LowerCaseMod = lcmod;

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}

// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}
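
// As an example of the expansion: the helpers above produce, among others,
// AMDGPUSample_c_cl_o (z-compare, clamp and offset combined), which in turn
// yields intrinsics such as int_amdgcn_image_sample_c_cl_o_2d below.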

// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overridden by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}

class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}

class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}

class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}

class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}

class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}

// Helper class for figuring out image intrinsic argument indexes.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !not(P_.IsAtomic);
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
  int MipArgIndex = LodArgIndex;
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}
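
// Worked example: for a basic 2D sample (IsSample = 1, no data or extra
// address arguments), the evaluated indices are
//   dmask(0), s(1), t(2), rsrc(3), samp(4), unorm(5),
//   texfailctrl(6), cachepolicy(7),
// matching a call of the shape
//   llvm.amdgcn.image.sample.2d(i32 dmask, float s, float t, v8i32 rsrc,
//                               v4i32 samp, i1 unorm, i32 texfailctrl,
//                               i32 cachepolicy)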

// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : Intrinsic<
    P_.RetTypes,        // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type),      // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),       // dmask(imm)
      P_.AddrTypes,                              // vaddr(VGPR)
      [llvm_v8i32_ty],                           // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,           // samp(SGPR)
                        llvm_i1_ty], []),        // unorm(imm)
      [llvm_i32_ty,                              // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                            // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)

     !listconcat(props,
          !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
          !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
          [IntrWillReturn],
          [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),


      "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}

// Marker class for intrinsics with a DMask that determines the returned
// channels.
class AMDGPUImageDMaskIntrinsic;

defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

  //////////////////////////////////////////////////////////////////////////
  // Load and store intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = false> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;

  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;

  //////////////////////////////////////////////////////////////////////////
  // MSAA intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  //////////////////////////////////////////////////////////////////////////
  // sample and getlod intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }

  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }

  //////////////////////////////////////////////////////////////////////////
  // gather4 intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
}

//////////////////////////////////////////////////////////////////////////
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
        foreach dim = AMDGPUDims.All in {
          def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
              !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
                   AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
              [], [SDNPMemOperand]>;
        }
  }

  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;

  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
}

//////////////////////////////////////////////////////////////////////////
// Buffer intrinsics
//////////////////////////////////////////////////////////////////////////

let TargetPrefix = "amdgcn" in {

defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, IntrWillReturn,
   ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;

// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : Intrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;

class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,          // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, IntrWillReturn,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;

// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different from
// the corresponding raw intrinsic on gfx9+ because the behavior of bounds
// checking and swizzling changes depending on whether idxen is set in the
// instruction. These new intrinsics also keep the offset and soffset
// arguments separate, as they behave differently in bounds checking and
// swizzling.
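//
// Roughly, the addressing works out as (a sketch, ignoring the bounds-check
// and swizzle details mentioned above):
//   raw:    address = base(rsrc) + soffset + offset
//   struct: address = base(rsrc) + soffset + offset + vindex * stride(rsrc)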
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
  [IntrReadMem, IntrWillReturn, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;

class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
  [IntrReadMem, IntrWillReturn, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;

class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
  [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;

class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
  [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;

class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;

class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<6>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;


// Obsolescent tbuffer intrinsics.
def int_amdgcn_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // voffset(VGPR)
     llvm_i32_ty,     // soffset(SGPR)
     llvm_i32_ty,     // offset(imm)
     llvm_i32_ty,     // dfmt(imm)
     llvm_i32_ty,     // nfmt(imm)
     llvm_i1_ty,      // glc(imm)
     llvm_i1_ty],     // slc(imm)
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
     ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // voffset(VGPR)
     llvm_i32_ty,    // soffset(SGPR)
     llvm_i32_ty,    // offset(imm)
     llvm_i32_ty,    // dfmt(imm)
     llvm_i32_ty,    // nfmt(imm)
     llvm_i1_ty,     // glc(imm)
     llvm_i1_ty],    // slc(imm)
    [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<5>>,
     ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
     ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
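
// Worked example for the format immediate: with hypothetical encodings
// dfmt = 4 and nfmt = 7, format = (nfmt << 4) | dfmt = (7 << 4) | 4 = 0x74.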

def int_amdgcn_raw_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                     //                                       bit 1 = slc,
                     //                                       bit 2 = dlc on gfx10+),
                     //                      swizzled buffer (bit 3 = swz))
    [IntrWriteMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                     //                                       bit 1 = slc,
                     //                                       bit 2 = dlc on gfx10+),
                     //                      swizzled buffer (bit 3 = swz))
    [IntrWriteMem, IntrWillReturn,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
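
// Illustrative IR for the struct variant, which adds the vindex operand
// (same assumed format encoding as the raw load sketch above):
//   call void @llvm.amdgcn.struct.tbuffer.store.v4f32(
//       <4 x float> %val, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0,
//       i32 126, i32 0)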

class AMDGPUBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  [llvm_i32_ty],
  [llvm_i32_ty,       // src(VGPR)
   llvm_i32_ty,       // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;

class AMDGPUBufferAtomicFP : Intrinsic <
  [llvm_anyfloat_ty],
  [LLVMMatchType<0>, // vdata(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // vindex(VGPR)
   llvm_i32_ty,      // offset(SGPR/VGPR/imm)
   llvm_i1_ty],      // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// Legacy form of the intrinsic; the raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;

class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                                       //                                       bit 1 = slc,
                                       //                                       bit 2 = dlc on gfx10+),
                                       //                      swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;
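
// Illustrative IR, copying one dword per lane into LDS with all offsets
// and the cachepolicy left at zero:
//   call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc,
//       i8 addrspace(3)* %lds, i32 4, i32 %voffset, i32 0, i32 0, i32 0)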

class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // vindex(VGPR)
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                                       //                                       bit 1 = slc,
                                       //                                       bit 2 = dlc on gfx10+),
                                       //                      swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;

} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
// call site.
def int_amdgcn_exp : Intrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i1_ty         // vm (ignored on GFX11+)
  ],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly,
   IntrWillReturn]
>;
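
// Illustrative IR for a final color export to MRT0 (tgt=0) with all four
// channels enabled (en=15) and the done bit set:
//   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %g,
//       float %b, float %a, i1 true, i1 true)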

// exp with row_en bit set. Only supported on GFX11+.
def int_amdgcn_exp_row : Intrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i32_ty],      // row number
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   IntrWriteMem, IntrInaccessibleMemOnly, IntrWillReturn]
>;

// exp with compr bit set. Not supported on GFX11+.
def int_amdgcn_exp_compr : Intrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_anyvector_ty, // src0 (v2f16 or v2i16)
  LLVMMatchType<0>,  // src1
  llvm_i1_ty,        // done
  llvm_i1_ty],       // vm
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly,
   IntrWillReturn]
>;

def int_amdgcn_buffer_wbinvl1_sc :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_buffer_wbinvl1 :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_dcache_inv :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_memtime :
  ClangBuiltin<"__builtin_amdgcn_s_memtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_sleep :
  ClangBuiltin<"__builtin_amdgcn_s_sleep">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_incperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_decperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_decperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_sethalt :
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_setprio :
  ClangBuiltin<"__builtin_amdgcn_s_setprio">,
  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

// This is IntrHasSideEffects so it can be used to read cycle counters.
def int_amdgcn_s_getreg :
  ClangBuiltin<"__builtin_amdgcn_s_getreg">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg<ArgIndex<0>>]
>;

// Note this can be used to set FP environment properties that are
// unsafe to change in non-strictfp functions. The register properties
// available (and value required to access them) may differ per
// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
def int_amdgcn_s_setreg :
  ClangBuiltin<"__builtin_amdgcn_s_setreg">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg<ArgIndex<0>>]
>;
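
// Illustrative use, assuming the usual hwreg immediate encoding
// id | (offset << 6) | ((size - 1) << 11), here id 1 (HW_REG_MODE),
// offset 0, size 4, i.e. the FP round-mode bits:
//   call void @llvm.amdgcn.s.setreg(i32 6145, i32 %val)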

// int_amdgcn_s_getpc is provided to allow a specific style of position
// independent code to determine the high part of its address when it is
// known (through convention) that the code and any data of interest do
// not cross a 4 GiB address boundary. Use for any other purpose may not
// produce the desired results as optimizations may cause code movement,
// especially as we explicitly use IntrNoMem to allow optimizations.
def int_amdgcn_s_getpc :
  ClangBuiltin<"__builtin_amdgcn_s_getpc">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
                                IntrWillReturn]>;

// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
// param values: 0 = P10, 1 = P20, 2 = P0
def int_amdgcn_interp_mov :
  ClangBuiltin<"__builtin_amdgcn_interp_mov">,
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
// This intrinsic reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_interp_p1 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
def int_amdgcn_interp_p2 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
// See int_amdgcn_interp_p1 for why this is IntrNoMem.
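
// A typical two-stage interpolation sequence in IR (attr_chan and attr both
// 0 here purely for illustration):
//   %p = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %m0)
//   %v = call float @llvm.amdgcn.interp.p2(float %p, float %j, i32 0, i32 0, i32 %m0)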

// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p1_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p2_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">,
  Intrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.lds.direct.load <m0>
// The input argument is m0, which contains a packed combination of address
// offset and flags describing the data type.
def int_amdgcn_lds_direct_load :
  Intrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16
            [llvm_i32_ty],
            [IntrReadMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0>
// Like interp intrinsics, this reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_lds_param_load :
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0>
def int_amdgcn_interp_inreg_p10 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
def int_amdgcn_interp_inreg_p2 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
// high selects whether high or low 16-bits are used for p and p0 operands
def int_amdgcn_interp_inreg_p10_f16 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high>
// high selects whether high or low 16-bits are used for p operand
def int_amdgcn_interp_inreg_p2_f16 :
  Intrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<3>>]>;

// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : Intrinsic <
  [llvm_i1_ty],
  [],
  [IntrNoMem, IntrWillReturn]>;

// Query currently live lanes.
// Returns true if lane is live (and not a helper lane).
def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty],
  [], [IntrReadMem, IntrInaccessibleMemOnly, IntrWillReturn]
>;

def int_amdgcn_mbcnt_lo :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn]>;

def int_amdgcn_mbcnt_hi :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn]>;

// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
  ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>]>;

def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;
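
// Roughly, both extract a bitfield (offset and width taken modulo the bit
// width; a zero width yields 0):
//   ubfe(src, offset, width) = (src >> offset) & ((1 << width) - 1)
// e.g. ubfe(0xABCD, 8, 4) = 0xB, while sbfe also sign-extends the field.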

def int_amdgcn_lerp :
  ClangBuiltin<"__builtin_amdgcn_lerp">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_msad_u8 :
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_hi_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_u16 :
  ClangBuiltin<"__builtin_amdgcn_sad_u16">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_qsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mqsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mqsad_u32_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
  Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_u8_f32 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
  Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_icmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>]>;

def int_amdgcn_fcmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>]>;
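
// The predicate operand takes CmpInst predicate values (e.g. 32 = icmp eq).
// Illustrative: collect a wave-wide mask of the lanes where %x is zero:
//   %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %x, i32 0, i32 32)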

def int_amdgcn_ballot :
  Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_readfirstlane :
  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
  ClangBuiltin<"__builtin_amdgcn_readlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
  ClangBuiltin<"__builtin_amdgcn_writelane">,
  Intrinsic<[llvm_i32_ty], [
    llvm_i32_ty,    // uniform value to write: returned by the selected lane
    llvm_i32_ty,    // uniform lane select
    llvm_i32_ty     // returned by all lanes other than the selected one
  ],
  [IntrNoMem, IntrConvergent, IntrWillReturn]
>;
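
// Illustrative: write a uniform value into lane 5, all other lanes keep %old:
//   %v = call i32 @llvm.amdgcn.writelane(i32 %uniform, i32 5, i32 %old)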

def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mul_i24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mulhi_i24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mulhi_u24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_init :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_init">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrWriteMem,
   IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;
// llvm.amdgcn.ds.gws.barrier(i32 bar_val, i32 resource_id)
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_barrier :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.v(i32 resource_id)
def int_amdgcn_ds_gws_sema_v :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id)
def int_amdgcn_ds_gws_sema_br :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.p(i32 resource_id)
def int_amdgcn_ds_gws_sema_p :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id)
def int_amdgcn_ds_gws_sema_release_all :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// Copies the source value to the destination value, with the guarantee that
// the source value is computed as if the entire program were executed in WQM.
def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Copies the source value to the destination value, such that the source
// is computed as if the entire program were executed in WQM if any other
// program code executes in WQM.
def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Return true if at least one thread within the pixel quad passes true into
// the function.
def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
  [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

// If false, set EXEC=0 for the current thread until the end of program.
// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn?
def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;

def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">,
  Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects]
>;

// If false, mark all active lanes as helper lanes until the end of program.
def int_amdgcn_wqm_demote : Intrinsic<[],
  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly]
>;

// Copies the active channels of the source value to the destination value,
// with the guarantee that the source value is computed as if the entire
// program were executed in Whole Wavefront Mode, i.e. with all channels
// enabled, with a few exceptions:
//  - Phi nodes which require WWM return an undefined value.
def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn]
>;
// Deprecated. Use int_amdgcn_strict_wwm instead.
def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn]
>;
def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn]
>;

// Given a value, copies it while setting all the inactive lanes to a given
// value. Note that OpenGL helper lanes are considered active, so if the
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, // value to be copied
             LLVMMatchType<0>], // value for the inactive lanes to take
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// Return true if the given flat pointer points to a local memory address.
def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">,
  Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>, IntrWillReturn]
>;

// Return true if the given flat pointer points to a private memory address.
def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">,
  Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>, IntrWillReturn]
>;

//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_s_dcache_inv_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_buffer_wbinvl1_vol :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// VI Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_mov_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
             llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// Should be equivalent to:
// v_mov_b32 <dest> <old>
// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_update_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
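
// Illustrative, assuming the usual DPP control encoding in which 0x111 is
// row_shr:1 (row and bank masks fully enabled, bound_ctrl off):
//   %v = call i32 @llvm.amdgcn.update.dpp.i32(i32 %old, i32 %src,
//            i32 273, i32 15, i32 15, i1 false)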

def int_amdgcn_s_dcache_wb :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_dcache_wb_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_memrealtime :
  ClangBuiltin<"__builtin_amdgcn_s_memrealtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

// llvm.amdgcn.ds.permute <index> <src>
def int_amdgcn_ds_permute :
  ClangBuiltin<"__builtin_amdgcn_ds_permute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// llvm.amdgcn.ds.bpermute <index> <src>
def int_amdgcn_ds_bpermute :
  ClangBuiltin<"__builtin_amdgcn_ds_bpermute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
     [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// llvm.amdgcn.perm <src0> <src1> <selector>
def int_amdgcn_perm :
  ClangBuiltin<"__builtin_amdgcn_perm">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
     [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//

class AMDGPUGlobalLoadLDS : Intrinsic <
  [],
  [LLVMQualPointerType<llvm_i8_ty, 1>, // Base global pointer to load from
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base pointer to store to
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // imm offset (applied to both global and LDS address)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
                                       //                                   bit 1 = slc/sc1,
                                       //                                   bit 2 = dlc on gfx10+,
                                       //                                   bit 4 = scc/nt on gfx90a+))
  [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>],
  "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;

//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
// <sel> is a 32-bit constant whose high 8 bits must be zero; it selects
// which lanes to read from.
def int_amdgcn_mov_dpp8 :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>]>;
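
// Illustrative: 16434824 (0xFAC688) packs sel[i] = i, the identity
// permutation in which each lane of a group of 8 reads itself:
//   %v = call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %src, i32 16434824)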

def int_amdgcn_s_get_waveid_in_workgroup :
  ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
  Intrinsic<[llvm_i32_ty], [],
    [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
  [vt],
  [llvm_anyptr_ty,    // vaddr
   vt],               // vdata(VGPR)
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>], "",
  [SDNPMemOperand]>;

def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;

// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
//                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
def int_amdgcn_image_bvh_intersect_ray :
  Intrinsic<[llvm_v4i32_ty],
            [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
             LLVMMatchType<1>, llvm_v4i32_ty],
            [IntrReadMem, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// GFX11 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_ds_add_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_ds_sub_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C.

class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %high
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<3>>]
>;

class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      llvm_i1_ty,       // %A_sign
      AB,               // %A
      llvm_i1_ty,       // %B_sign
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %clamp
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
>;

def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;

//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//

// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2 :
  ClangBuiltin<"__builtin_amdgcn_fdot2">,
  Intrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;
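
// Worked example: fdot2(<1.0, 2.0>, <3.0, 4.0>, 0.5, false)
//   = 1.0*3.0 + 2.0*4.0 + 0.5 = 11.5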

// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
  Intrinsic<
    [llvm_half_ty],  // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_half_ty   // %c
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
  >;

// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_bf16_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
  Intrinsic<
    [llvm_i16_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i16_ty    // %c
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
  >;

// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f32_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
  Intrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_sdot2 :
  ClangBuiltin<"__builtin_amdgcn_sdot2">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_udot2 :
  ClangBuiltin<"__builtin_amdgcn_udot2">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sdot4 :
  ClangBuiltin<"__builtin_amdgcn_sdot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;
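
// Worked example: with %a = 0x01010101, %b = 0x02020202 and %c = 0, every
// byte product is 1*2, so %r = 4*2 + 0 = 8.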

// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_udot4 :
  ClangBuiltin<"__builtin_amdgcn_udot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0...3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
// b[i in 0...3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sudot4 :
  ClangBuiltin<"__builtin_amdgcn_sudot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sdot8 :
  ClangBuiltin<"__builtin_amdgcn_sdot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_udot8 :
  ClangBuiltin<"__builtin_amdgcn_udot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0...7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
// b[i in 0...7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sudot8 :
  ClangBuiltin<"__builtin_amdgcn_sudot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

//===----------------------------------------------------------------------===//
// gfx908 intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  Intrinsic<[DestTy],
            [SrcABTy, SrcABTy, DestTy,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, IntrWillReturn,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_mfma_f32_32x32x1f32  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x2f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x4f32  : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x4f16  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x4f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_4x4x4f16    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_32x32x8f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_i32_32x32x4i8   : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x4i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_4x4x4i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_i32_32x32x8i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x16i8  : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_4x4x2bf16   : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
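
// A possible use from IR (illustrative; cbsz, abid and blgp all zero):
//   %d = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(
//            float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)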

//===----------------------------------------------------------------------===//
// gfx90a intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

def int_amdgcn_mfma_f32_32x32x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;

// Note: in gfx940 the BLGP argument is replaced by a NEG bitfield in the
//       DGEMM MFMA. Its three bits correspond to the neg modifier applied
//       to the respective source operand.
def int_amdgcn_mfma_f64_16x16x4f64      : AMDGPUMfmaIntrinsic<llvm_v4f64_ty,  llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64        : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
//===----------------------------------------------------------------------===//

// bf16 atomics use v2i16 arguments since there is no bf16 data type in LLVM.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : Intrinsic<
    [llvm_v2i16_ty],
    [LLVMQualPointerType<llvm_v2i16_ty, 3>, llvm_v2i16_ty],
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>]>,
    ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

def int_amdgcn_mfma_i32_16x16x32_i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8     : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  Intrinsic<[DestTy],
            [SrcA, SrcB, DestTy, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16  : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16  : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8   : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty,  llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8   : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
//===----------------------------------------------------------------------===//
def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, LLVMMatchType<0>],
  [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
  [IntrConvergent, IntrWillReturn]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;

// Emit a 2.5 ulp, no-denormal division. Should only be inserted by a pass
// based on !fpmath metadata.
def int_amdgcn_fdiv_fast : Intrinsic<
  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Represent a relocation constant.
def int_amdgcn_reloc_constant : Intrinsic<
  [llvm_i32_ty], [llvm_metadata_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;
} // TargetPrefix = "amdgcn"