//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the AMDGPU-specific intrinsics.
//
//===----------------------------------------------------------------------===//

class AMDGPUReadPreloadRegisterIntrinsic
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>, ClangBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
  int RsrcArg = rsrcarg;
  bit IsImage = isimage;
}

let TargetPrefix = "r600" in {

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}

defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;

def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;

// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
def int_r600_implicitarg_ptr :
  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrWillReturn]>,
  ClangBuiltin<"__builtin_r600_rat_store_typed">;

def int_r600_recipsqrt_ieee : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_recipsqrt_clamped : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_cube : Intrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_store_stream_output : Intrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn]
>;

class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y
  llvm_i32_ty,   // offset_z
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem, IntrWillReturn]
>;

class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
    llvm_v4i32_ty, // Coord
    llvm_i32_ty,   // offset_x
    llvm_i32_ty,   // offset_y
    llvm_i32_ty,   // offset_z
    llvm_i32_ty,   // resource_id
    llvm_i32_ty,   // samplerid
    llvm_i32_ty,   // coord_type_x
    llvm_i32_ty,   // coord_type_y
    llvm_i32_ty,   // coord_type_z
    llvm_i32_ty],  // coord_type_w
    [IntrNoMem, IntrWillReturn]
>;

def int_r600_store_swizzle :
  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn]
>;

def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;

def int_r600_dot4 : Intrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_kill : Intrinsic<[], [llvm_float_ty], [IntrWillReturn]>;

} // End TargetPrefix = "r600"

let TargetPrefix = "amdgcn" in {

//===----------------------------------------------------------------------===//
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;

def int_amdgcn_dispatch_ptr :
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_queue_ptr :
  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_kernarg_segment_ptr :
  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_implicitarg_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_groupstaticsize :
  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_dispatch_id :
  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// For internal use. Coordinates LDS lowering between IR transform and backend.
def int_amdgcn_lds_kernel_id :
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_implicit_buffer_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
// FIXME: Should be mangled for wave size.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, ImmArg<ArgIndex<0>>]>;
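
// Illustrative IR use (not from this file): enabling all 64 lanes at the
// start of a wave64 kernel; the i64 mask must be an immediate.
//   call void @llvm.amdgcn.init.exec(i64 -1)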

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic; computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent, ImmArg<ArgIndex<1>>]>;
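
// Illustrative IR use (not from this file), assuming %count is an inreg
// argument carrying the thread count in bits 8..14:
//   call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
// EXEC is then set to enable lanes 0 .. ((%count >> 8) & 0x7f) - 1.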

def int_amdgcn_wavefrontsize :
  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is the s_sendmsg immediate (i16);
// the second one is copied to m0.
def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

// gfx11 intrinsic
// The first parameter is the s_sendmsg immediate (i16). The return type is
// i32 or i64.
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn]>;
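
// Illustrative IR use (not from this file): the mask bits compose, so
// allowing both VALU (0x2) and SALU (0x4) instructions to cross gives:
//   call void @llvm.amdgcn.sched.barrier(i32 6)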

def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_div_scale : Intrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Constant selecting which operand to scale
  //                (0 = Denominator, 1 = Numerator).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>, IntrWillReturn]
>;
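
// Illustrative IR use (not from this file): scaling the numerator side of a
// division; the returned i1 reports whether scaling was applied.
//   %s = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %num, float %den, i1 true)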

def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_div_fixup : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Look up 2.0 / pi (src0) with segment select src1[4:0].
def int_amdgcn_trig_preop : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sin : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cos : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_log_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
>;

// Fused single-precision multiply-add with legacy behaviour for the multiply,
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely different kind of legacy behaviour.)
def int_amdgcn_fma_legacy :
  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
>;
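
// For example, given the legacy multiply semantics described above, a call
// such as the following (illustrative, not from this file)
//   %r = call float @llvm.amdgcn.fma.legacy(float 0.0, float %nan, float 1.0)
// yields 1.0 (0.0 * NaN is +0.0), where an IEEE fma would yield NaN.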

def int_amdgcn_rcp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sqrt : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rsq : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  Intrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// out = 1.0 / sqrt(a), result clamped to +/- max_float.
def int_amdgcn_rsq_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_ldexp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_frexp_mant : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_frexp_exp : Intrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0,
// and always uses rtz, so it is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pknorm_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pknorm_u16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_i16 :
    ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
    Intrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_class : Intrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_fmed3 : ClangBuiltin<"__builtin_amdgcn_fmed3">,
  Intrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
  Intrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
  Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
  Intrinsic<[llvm_anyfloat_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Fields should mirror atomicrmw
class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
  [llvm_anyptr_ty,
  LLVMMatchType<0>,
  llvm_i32_ty, // ordering
  llvm_i32_ty, // scope
  llvm_i1_ty], // isVolatile
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "",
  [SDNPMemOperand]
>;

def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;

class AMDGPULDSIntrin :
  Intrinsic<[llvm_any_ty],
    [LLVMQualPointerType<LLVMMatchType<0>, 3>,
    LLVMMatchType<0>,
    llvm_i32_ty, // ordering
    llvm_i32_ty, // scope
    llvm_i1_ty], // isVolatile
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]
>;

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<llvm_i32_ty, 2>, // IntToPtr(M0)
   llvm_i32_ty, // value to add or swap
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty,  // isVolatile
   llvm_i32_ty, // ordered count index (OA index), also added to the address
                // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,  // wave release, usually set to 1
   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
  [IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>
  ]
>;

class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty], // isVolatile
   [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>],
   "",
   [SDNPMemOperand]
>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

} // TargetPrefix = "amdgcn"

// New-style image intrinsics

//////////////////////////////////////////////////////////////////////////
// Dimension-aware image intrinsics framework
//////////////////////////////////////////////////////////////////////////

// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}

// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}
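
// For example, makeArgList<["s", "t"], llvm_anyfloat_ty>.ret evaluates to
//   [AMDGPUArg<llvm_anyfloat_ty, "s">, AMDGPUArg<LLVMMatchType<0>, "t">]
// so "t" is constrained to the same type as the overloaded "s".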

// Return arglist, with LLVMMatchType's references shifted by 'shift'.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}

// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
                               !add(shift, !foldl(0, lhs, a, b,
                                                  !add(a, b.Type.isAny)))>.ret));
}
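
// Worked example: with shift = 1 (one overloaded return type preceding the
// arguments), concatenating [bias : llvm_anyfloat_ty] with
// [s : llvm_anyfloat_ty, t : LLVMMatchType<0>] yields
//   [bias : llvm_anyfloat_ty, s : llvm_anyfloat_ty, t : LLVMMatchType<2>]
// because "t" must now match overload slot 2 (ret = 0, bias = 1, s = 2).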

// Represent texture/image types / dimensionality.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names,
                     bit msaa = 0> {
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name; // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0; // DA bit in MIMG encoding
  bit MSAA = msaa;

  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}

def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
}

def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}

// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod;
  string LowerCaseMod = lcmod;

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}

// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}
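
// The defsets above expand to variants such as AMDGPUSample, AMDGPUSample_cl,
// AMDGPUSample_c, AMDGPUSample_c_cl_o, AMDGPUSample_b, AMDGPUSample_l,
// AMDGPUSample_lz, AMDGPUSample_d and AMDGPUSample_cd, each carrying the
// extra address arguments its modifiers require.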

// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}

class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}

class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}

class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}

class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}

class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}

// Helper class for figuring out image intrinsic argument indexes.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !not(P_.IsAtomic);
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
  int MipArgIndex = LodArgIndex;
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}
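
// Worked example: for a 2D load profile (IsSample = 0, IsAtomic = 0, no data
// args) this evaluates to DmaskArgIndex = 0, VAddrArgIndex = 1 (s at 1, t at
// 2), RsrcArgIndex = 3, TexFailCtrlArgIndex = 4, CachePolicyArgIndex = 5,
// matching the operand order of llvm.amdgcn.image.load.2d.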

// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : Intrinsic<
    P_.RetTypes,        // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type),      // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),       // dmask(imm)
      P_.AddrTypes,                              // vaddr(VGPR)
      [llvm_v8i32_ty],                           // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,           // samp(SGPR)
                        llvm_i1_ty], []),        // unorm(imm)
      [llvm_i32_ty,                              // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                            // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)

     !listconcat(props,
          !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
          !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
          [IntrWillReturn],
          [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),

      "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}

// Marker class for intrinsics with a DMask that determines the returned
// channels.
class AMDGPUImageDMaskIntrinsic;

defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

  //////////////////////////////////////////////////////////////////////////
  // Load and store intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = false> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;
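
  // For reference, the profiles above instantiate to IR declarations such as
  // (illustrative overload choice):
  //   declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(
  //       i32 immarg %dmask, i32 %s, i32 %t, <8 x i32> %rsrc,
  //       i32 immarg %texfailctrl, i32 immarg %cachepolicy)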

  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;

  //////////////////////////////////////////////////////////////////////////
  // MSAA intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  //////////////////////////////////////////////////////////////////////////
  // sample and getlod intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }

  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }

  //////////////////////////////////////////////////////////////////////////
  // gather4 intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
}

//////////////////////////////////////////////////////////////////////////
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
          !if(isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
              AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
          [], [SDNPMemOperand]>;
    }
  }

  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;

  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
}

//////////////////////////////////////////////////////////////////////////
// Buffer intrinsics
//////////////////////////////////////////////////////////////////////////

let TargetPrefix = "amdgcn" in {

defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, IntrWillReturn,
   ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;

// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : Intrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;
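
// Illustrative IR use (not from this file), with an arbitrary overload:
//   %v = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc,
//                                                          i32 %off, i32 0)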

class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, IntrWillReturn,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;

// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different from the
// corresponding raw intrinsic on gfx9+ because the behavior of bounds checking
// and swizzling changes depending on whether idxen is set in the instruction.
// These new intrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
  [IntrReadMem, IntrWillReturn, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;

class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
  [IntrReadMem, IntrWillReturn, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;
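
// The raw/struct split is visible in the IR: an illustrative pair of calls
// (overloads chosen arbitrarily) differs only in the extra vindex operand:
//   %a = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %off, i32 %soff, i32 0)
//   %b = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %off, i32 %soff, i32 0)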

class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
  [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;

class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
  [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;

class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;

class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<6>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

// Obsolescent tbuffer intrinsics.
def int_amdgcn_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // voffset(VGPR)
     llvm_i32_ty,     // soffset(SGPR)
     llvm_i32_ty,     // offset(imm)
     llvm_i32_ty,     // dfmt(imm)
     llvm_i32_ty,     // nfmt(imm)
     llvm_i1_ty,      // glc(imm)
     llvm_i1_ty],     // slc(imm)
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
     ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // voffset(VGPR)
     llvm_i32_ty,    // soffset(SGPR)
     llvm_i32_ty,    // offset(imm)
     llvm_i32_ty,    // dfmt(imm)
     llvm_i32_ty,    // nfmt(imm)
     llvm_i1_ty,     // glc(imm)
     llvm_i1_ty],    // slc(imm)
    [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<5>>,
     ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
     ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
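
// The joint format operand packs dfmt and nfmt into one immediate; e.g.
// dfmt = 4 and nfmt = 7 (values chosen for illustration) would be passed as
//   format = dfmt | (nfmt << 4) = 4 | (7 << 4) = 0x74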

def int_amdgcn_raw_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                     //                                       bit 1 = slc,
                     //                                       bit 2 = dlc on gfx10+),
                     //                      swizzled buffer (bit 3 = swz))
    [IntrWriteMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                      //                                       bit 1 = slc,
                      //                                       bit 2 = dlc on gfx10+),
                      //                      swizzled buffer (bit 3 = swz))
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                     //                                       bit 1 = slc,
                     //                                       bit 2 = dlc on gfx10+),
                     //                      swizzled buffer (bit 3 = swz))
    [IntrWriteMem, IntrWillReturn,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

class AMDGPUBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  [llvm_i32_ty],
  [llvm_i32_ty,       // src(VGPR)
   llvm_i32_ty,       // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
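
// Illustrative use (not normative): a 32-bit buffer atomic add that returns
// the pre-op memory value:
//   %old = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %val,
//              <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 false)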

def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;

class AMDGPUBufferAtomicFP : Intrinsic <
  [llvm_anyfloat_ty],
  [LLVMMatchType<0>, // vdata(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // vindex(VGPR)
   llvm_i32_ty,      // offset(SGPR/VGPR/imm)
   llvm_i1_ty],      // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// Legacy form of the intrinsic. The raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;

class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                                       //                                       bit 1 = slc,
                                       //                                       bit 2 = dlc on gfx10+),
                                       //                      swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;

class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // vindex(VGPR)
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy     (bit 0 = glc,
                                       //                                       bit 1 = slc,
                                       //                                       bit 2 = dlc on gfx10+),
                                       //                      swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;

} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
// call site.
def int_amdgcn_exp : Intrinsic <[], [
  llvm_i32_ty,       // tgt
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i1_ty         // vm (ignored on GFX11+)
  ],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly,
   IntrWillReturn]
>;

// exp with row_en bit set. Only supported on GFX11+.
def int_amdgcn_exp_row : Intrinsic <[], [
  llvm_i32_ty,       // tgt
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i32_ty],      // row number
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   IntrWriteMem, IntrInaccessibleMemOnly, IntrWillReturn]
>;

// exp with compr bit set. Not supported on GFX11+.
def int_amdgcn_exp_compr : Intrinsic <[], [
  llvm_i32_ty,       // tgt
  llvm_i32_ty,       // en
  llvm_anyvector_ty, // src0 (v2f16 or v2i16)
  LLVMMatchType<0>,  // src1
  llvm_i1_ty,        // done
  llvm_i1_ty],       // vm
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly,
   IntrWillReturn]
>;

def int_amdgcn_buffer_wbinvl1_sc :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_buffer_wbinvl1 :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_dcache_inv :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_memtime :
  ClangBuiltin<"__builtin_amdgcn_s_memtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_sleep :
  ClangBuiltin<"__builtin_amdgcn_s_sleep">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_incperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_decperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_decperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_sethalt :
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_setprio :
  ClangBuiltin<"__builtin_amdgcn_s_setprio">,
  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

// Marked IntrHasSideEffects so that reads of volatile registers (e.g. cycle
// counters) are not CSE'd or speculated.
def int_amdgcn_s_getreg :
  ClangBuiltin<"__builtin_amdgcn_s_getreg">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg<ArgIndex<0>>]
>;

// Note this can be used to set FP environment properties that are
// unsafe to change in non-strictfp functions. The register properties
// available (and the value required to access them) may differ per
// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
def int_amdgcn_s_setreg :
  ClangBuiltin<"__builtin_amdgcn_s_setreg">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg<ArgIndex<0>>]
>;

// int_amdgcn_s_getpc is provided to allow a specific style of position
// independent code to determine the high part of its address when it is
// known (through convention) that the code and any data of interest do
// not cross a 4GB address boundary. Use for any other purpose may not
// produce the desired results, as optimizations may cause code movement,
// especially as we explicitly use IntrNoMem to allow optimizations.
def int_amdgcn_s_getpc :
  ClangBuiltin<"__builtin_amdgcn_s_getpc">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
                                IntrWillReturn]>;
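
// A minimal sketch of the intended use (illustrative only):
//   %pc = call i64 @llvm.amdgcn.s.getpc()
//   %hi = and i64 %pc, -4294967296  ; keep the high 32 bits; combine with a
//                                   ; 32-bit relocated offset for a full address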

// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
// param values: 0 = P10, 1 = P20, 2 = P0
def int_amdgcn_interp_mov :
  ClangBuiltin<"__builtin_amdgcn_interp_mov">,
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
// This intrinsic reads from LDS, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_interp_p1 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
// See int_amdgcn_interp_p1 for why this is IntrNoMem.
def int_amdgcn_interp_p2 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether the high or low 16 bits are loaded from LDS
def int_amdgcn_interp_p1_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether the high or low 16 bits are loaded from LDS
def int_amdgcn_interp_p2_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">,
  Intrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.lds.direct.load <m0>
// The input argument is m0, which contains a packed combination of address
// offset and flags describing the data type.
def int_amdgcn_lds_direct_load :
  Intrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16
            [llvm_i32_ty],
            [IntrReadMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0>
// Like the interp intrinsics, this reads from LDS, but the memory values are
// constant, so it behaves like IntrNoMem.
def int_amdgcn_lds_param_load :
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0>
def int_amdgcn_interp_inreg_p10 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
def int_amdgcn_interp_inreg_p2 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
// high selects whether the high or low 16 bits are used for the p and p0
// operands
def int_amdgcn_interp_inreg_p10_f16 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high>
// high selects whether the high or low 16 bits are used for the p operand
def int_amdgcn_interp_inreg_p2_f16 :
  Intrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<3>>]>;

// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : Intrinsic <
  [llvm_i1_ty],
  [],
  [IntrNoMem, IntrWillReturn]>;

// Query currently live lanes.
// Returns true if the lane is live (and not a helper lane).
def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty],
  [], [IntrReadMem, IntrInaccessibleMemOnly, IntrWillReturn]
>;

def int_amdgcn_mbcnt_lo :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn]>;

def int_amdgcn_mbcnt_hi :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn]>;
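
// A common idiom for computing the lane id within a wave (illustrative):
//   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
//   %id = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)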

// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
  ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>]>;
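
// The offset immediate encodes the swizzle pattern (see the ISA manual):
// with bit 15 set, bits 7..0 give a per-quad lane permute; with bit 15
// clear, bits 4..0 / 9..5 / 14..10 are and/or/xor masks applied to the
// lane id.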

def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_lerp :
  ClangBuiltin<"__builtin_amdgcn_lerp">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_msad_u8 :
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_hi_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_u16 :
  ClangBuiltin<"__builtin_amdgcn_sad_u16">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_qsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mqsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mqsad_u32_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
  Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_u8_f32 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
  Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_icmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>]>;

def int_amdgcn_fcmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>]>;
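
// The trailing i32 is a CmpInst predicate value; the result is a bitmask of
// the active lanes for which the comparison holds. For illustration:
//   %eq = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %x, i32 %y, i32 32) ; 32 = eq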

def int_amdgcn_ballot :
  Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
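
// e.g. %mask = call i64 @llvm.amdgcn.ballot.i64(i1 %p) yields a bitmask of
// the active lanes in which %p is true.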

def int_amdgcn_readfirstlane :
  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
  ClangBuiltin<"__builtin_amdgcn_readlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
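
// e.g. %v0 = call i32 @llvm.amdgcn.readlane(i32 %x, i32 0) broadcasts the
// value of %x in lane 0.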

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
  ClangBuiltin<"__builtin_amdgcn_writelane">,
  Intrinsic<[llvm_i32_ty], [
    llvm_i32_ty,    // uniform value to write: returned by the selected lane
    llvm_i32_ty,    // uniform lane select
    llvm_i32_ty     // returned by all lanes other than the selected one
  ],
  [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mul_i24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mulhi_i24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mulhi_u24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_init :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_init">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrWriteMem,
   IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.barrier(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_barrier :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.v(i32 resource_id)
def int_amdgcn_ds_gws_sema_v :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id)
def int_amdgcn_ds_gws_sema_br :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.p(i32 resource_id)
def int_amdgcn_ds_gws_sema_p :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id)
def int_amdgcn_ds_gws_sema_release_all :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// Copies the source value to the destination value, with the guarantee that
// the source value is computed as if the entire program were executed in WQM.
def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Copies the source value to the destination value, such that the source
// is computed as if the entire program were executed in WQM if any other
// program code executes in WQM.
def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Return true if at least one thread within the pixel quad passes true into
// the function.
def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
  [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

// If false, set EXEC=0 for the current thread until the end of program.
// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn?
def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;

def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">,
  Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects]
>;

// If false, mark all active lanes as helper lanes until the end of program.
def int_amdgcn_wqm_demote : Intrinsic<[],
  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly]
>;

// Copies the active channels of the source value to the destination value,
// with the guarantee that the source value is computed as if the entire
// program were executed in Whole Wavefront Mode, i.e. with all channels
// enabled, with a few exceptions:
//  - Phi nodes which require WWM return an undefined value.
def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn]
>;
// Deprecated. Use int_amdgcn_strict_wwm instead.
def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn]
>;
def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn]
>;

// Given a value, copies it while setting all the inactive lanes to a given
// value. Note that OpenGL helper lanes are considered active, so if the
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>,  // value to be copied
             LLVMMatchType<0>], // value for the inactive lanes to take
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
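
// e.g. preparing an operand for a wave-wide sum so that inactive lanes
// contribute the identity (illustrative):
//   %v = call i32 @llvm.amdgcn.set.inactive.i32(i32 %x, i32 0)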

// Return true if the given flat pointer points to a local memory address.
def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">,
  Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>, IntrWillReturn]
>;

// Return true if the given flat pointer points to a private memory address.
def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">,
  Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>, IntrWillReturn]
>;

//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_s_dcache_inv_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_buffer_wbinvl1_vol :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// VI Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_mov_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
             llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// Should be equivalent to:
// v_mov_b32 <dest> <old>
// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_update_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
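
// For illustration, one step of a row-wise scan (dpp_ctrl 0x111 = row_shr:1,
// full row/bank masks, no bound control):
//   %s = call i32 @llvm.amdgcn.update.dpp.i32(i32 %identity, i32 %x,
//            i32 273, i32 15, i32 15, i1 false)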

def int_amdgcn_s_dcache_wb :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_dcache_wb_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_memrealtime :
  ClangBuiltin<"__builtin_amdgcn_s_memrealtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

// llvm.amdgcn.ds.permute <index> <src>
def int_amdgcn_ds_permute :
  ClangBuiltin<"__builtin_amdgcn_ds_permute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// llvm.amdgcn.ds.bpermute <index> <src>
def int_amdgcn_ds_bpermute :
  ClangBuiltin<"__builtin_amdgcn_ds_bpermute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// llvm.amdgcn.perm <src0> <src1> <selector>
def int_amdgcn_perm :
  ClangBuiltin<"__builtin_amdgcn_perm">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//

class AMDGPUGlobalLoadLDS : Intrinsic <
  [],
  [LLVMQualPointerType<llvm_i8_ty, 1>, // Base global pointer to load from
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base pointer to store to
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // imm offset (applied to both global and LDS address)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
                                       //                                   bit 1 = slc/sc1,
                                       //                                   bit 2 = dlc on gfx10+,
                                       //                                   bit 4 = scc/nt on gfx90a+))
  [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>],
  "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;

//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
// <sel> is a 32-bit constant whose high 8 bits must be zero; it selects
// the lanes to read from.
def int_amdgcn_mov_dpp8 :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>]>;
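
// Each of the 8 lanes in a group takes 3 bits of <sel>; e.g. sel = 0xFAC688
// encodes the identity permutation (lane i reads lane i).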

def int_amdgcn_s_get_waveid_in_workgroup :
  ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
  Intrinsic<[llvm_i32_ty], [],
    [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
  [vt],
  [llvm_anyptr_ty,    // vaddr
   vt],               // vdata(VGPR)
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>], "",
  [SDNPMemOperand]>;

def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;

// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
//                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
def int_amdgcn_image_bvh_intersect_ray :
  Intrinsic<[llvm_v4i32_ty],
            [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
             LLVMMatchType<1>, llvm_v4i32_ty],
            [IntrReadMem, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// GFX11 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_ds_add_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_ds_sub_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C.

class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %high
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<3>>]
>;

class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      llvm_i1_ty,       // %A_sign
      AB,               // %A
      llvm_i1_ty,       // %B_sign
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %clamp
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
>;

def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
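
// For illustration, on wave32 the f32 <- f16 WMMA uses an <8 x float>
// accumulator:
//   %D = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(
//            <16 x half> %A, <16 x half> %B, <8 x float> %C)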

//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//

// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2 :
  ClangBuiltin<"__builtin_amdgcn_fdot2">,
  Intrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
  Intrinsic<
    [llvm_half_ty],  // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_half_ty   // %c
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
  >;

// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_bf16_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
  Intrinsic<
    [llvm_i16_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i16_ty    // %c
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
  >;

// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f32_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
  Intrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_sdot2 :
  ClangBuiltin<"__builtin_amdgcn_sdot2">,
  Intrinsic<
    [llvm_i32_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_udot2 :
  ClangBuiltin<"__builtin_amdgcn_udot2">,
  Intrinsic<
    [llvm_i32_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sdot4 :
  ClangBuiltin<"__builtin_amdgcn_sdot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_udot4 :
  ClangBuiltin<"__builtin_amdgcn_udot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0..3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
// b[i in 0..3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sudot4 :
  ClangBuiltin<"__builtin_amdgcn_sudot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sdot8 :
  ClangBuiltin<"__builtin_amdgcn_sdot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_udot8 :
  ClangBuiltin<"__builtin_amdgcn_udot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0..7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
// b[i in 0..7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sudot8 :
  ClangBuiltin<"__builtin_amdgcn_sudot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

//===----------------------------------------------------------------------===//
// gfx908 intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  Intrinsic<[DestTy],
            [SrcABTy, SrcABTy, DestTy,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, IntrWillReturn,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_mfma_f32_32x32x1f32  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x2f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x4f32  : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x4f16  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x4f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_4x4x4f16    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_32x32x8f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_i32_32x32x4i8   : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x4i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_4x4x4i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_i32_32x32x8i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x16i8  : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_4x4x2bf16   : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
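
// For illustration, a 32x32x1 f32 MFMA with cbsz/abid/blgp all zero:
//   %D = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a,
//            float %b, <32 x float> %c, i32 0, i32 0, i32 0)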

//===----------------------------------------------------------------------===//
// gfx90a intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

def int_amdgcn_mfma_f32_32x32x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;

// Note: in gfx940 the BLGP argument is replaced by a NEG bitfield in the
//       DGEMM MFMA: three bits, each applying the neg modifier to the
//       respective source operand.
def int_amdgcn_mfma_f64_16x16x4f64      : AMDGPUMfmaIntrinsic<llvm_v4f64_ty,  llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64        : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
//===----------------------------------------------------------------------===//

// bf16 atomics use a v2i16 argument since there is no bf16 data type in LLVM.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : Intrinsic<
    [llvm_v2i16_ty],
    [LLVMQualPointerType<llvm_v2i16_ty, 3>, llvm_v2i16_ty],
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>]>,
    ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

def int_amdgcn_mfma_i32_16x16x32_i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8     : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32    : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
  AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}

defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  Intrinsic<[DestTy],
            [SrcA, SrcB, DestTy, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16  : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16  : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8   : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty,  llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8   : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
  AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}

defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects the byte from srcA.
def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects the 2 high bytes, 0 selects the 2 low bytes.
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
  Intrinsic<[llvm_v2f32_ty],
            [llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel
def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
  Intrinsic<[llvm_v2f32_ty],
            [llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel = 1 selects the 2 high bytes in the vdst, 0 selects the 2 low bytes.
def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
  Intrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
  Intrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects the byte to write into vdst.
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
  Intrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
  Intrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
//===----------------------------------------------------------------------===//
def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, LLVMMatchType<0>],
  [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
  [IntrConvergent, IntrWillReturn]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;

// Emit a 2.5 ulp, no-denormal division. Should only be inserted by a pass
// based on !fpmath metadata.
def int_amdgcn_fdiv_fast : Intrinsic<
  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Represent a relocation constant.
def int_amdgcn_reloc_constant : Intrinsic<
  [llvm_i32_ty], [llvm_metadata_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;
}