1//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
// This file defines all of the AMDGPU-specific intrinsics.
10//
11//===----------------------------------------------------------------------===//
12
// Base class for intrinsics that read a 32-bit value preloaded into a
// hardware register at wave launch; no memory access, safe to speculate.
class AMDGPUReadPreloadRegisterIntrinsic
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

// Same as above, but additionally exposed as a clang builtin called 'name'.
class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, GCCBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = 0> {
  int RsrcArg = rsrcarg;  // Index of the rsrc argument in the argument list.
  bit IsImage = isimage;  // True for image (MIMG) intrinsics.
}
25
26let TargetPrefix = "r600" in {
27
// Expands to three per-component intrinsics with _x/_y/_z suffixes.
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}

// As above, but each component is also a clang builtin named
// prefix # "_x"/"_y"/"_z".
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}

// Work dispatch queries preloaded by the hardware (per component x/y/z).
defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

// No builtin names for these two; lowered from language built-ins instead.
defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
49
// Workgroup barrier; convergent so it cannot be moved across divergent
// control flow.
def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent]>;

// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
// Returns a pointer to the implicit kernel argument area.
def int_r600_implicitarg_ptr :
  GCCBuiltin<"__builtin_r600_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [],
  [IntrNoMem, IntrSpeculatable]>;

def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>,
  GCCBuiltin<"__builtin_r600_rat_store_typed">;

// IEEE-conformant reciprocal square root.
def int_r600_recipsqrt_ieee :  Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Clamped variant of reciprocal square root.
def int_r600_recipsqrt_clamped : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Cube map coordinate computation on a v4f32 input.
def int_r600_cube : Intrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

// Stream-output store: v4f32 data plus three i32 control operands.
def int_r600_store_stream_output : Intrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
>;
81
// Common profile for R600 texture intrinsics taking float coordinates.
// Returns the sampled v4f32 texel.
class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y
  llvm_i32_ty,   // offset_z
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;

// Integer-coordinate counterpart of TextureIntrinsicFloatInput (same operand
// layout, v4i32 coordinates and result).
class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
    llvm_v4i32_ty, // Coord
    llvm_i32_ty,   // offset_x
    llvm_i32_ty,   // offset_y
    llvm_i32_ty,   // offset_z
    llvm_i32_ty,   // resource_id
    llvm_i32_ty,   // samplerid
    llvm_i32_ty,   // coord_type_x
    llvm_i32_ty,   // coord_type_y
    llvm_i32_ty,   // coord_type_z
    llvm_i32_ty],  // coord_type_w
    [IntrNoMem]
>;

// Store with a swizzle: v4f32 data plus two i32 control operands.
def int_r600_store_swizzle :
  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []
>;
113
// Texture opcodes. Suffixes follow the usual naming scheme: trailing 'c' =
// compare (shadow), 'l' = explicit LOD, 'b' = LOD bias, 'f' = texel fetch,
// 'q' = size query, ddx/ddy = derivatives.
def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;

// Four-component dot product of two v4f32 vectors, returning a scalar.
def int_r600_dot4 : Intrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

// Pixel kill keyed on the float operand.
def int_r600_kill : Intrinsic<[], [llvm_float_ty], []>;
130
131} // End TargetPrefix = "r600"
132
133let TargetPrefix = "amdgcn" in {
134
135//===----------------------------------------------------------------------===//
136// ABI Special Intrinsics
137//===----------------------------------------------------------------------===//
138
// Per-lane workitem id and per-wave workgroup id (x/y/z components).
defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;

// Pointer to the HSA dispatch packet (constant address space 4).
def int_amdgcn_dispatch_ptr :
  GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

// Pointer to the HSA queue (constant address space 4).
def int_amdgcn_queue_ptr :
  GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

// Pointer to the kernel argument segment (constant address space 4).
def int_amdgcn_kernarg_segment_ptr :
  GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

// Pointer to the implicit (hidden) kernel arguments.
def int_amdgcn_implicitarg_ptr :
  GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

// Size in bytes of statically allocated LDS for the current kernel.
def int_amdgcn_groupstaticsize :
  GCCBuiltin<"__builtin_amdgcn_groupstaticsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

// 64-bit unique id of the current dispatch.
def int_amdgcn_dispatch_id :
  GCCBuiltin<"__builtin_amdgcn_dispatch_id">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;

// Pointer to an implicit buffer (constant address space 4).
def int_amdgcn_implicit_buffer_ptr :
  GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [IntrNoMem, IntrSpeculatable]>;

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, ImmArg<0>]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent]>;

// Wavefront size of the current target (compile-time constant per subtarget).
def int_amdgcn_wavefrontsize :
  GCCBuiltin<"__builtin_amdgcn_wavefrontsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
193
194
195//===----------------------------------------------------------------------===//
196// Instruction Intrinsics
197//===----------------------------------------------------------------------===//
198
// The first parameter is s_sendmsg immediate (i16),
// the second one is copied to m0
def int_amdgcn_s_sendmsg : GCCBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, IntrInaccessibleMemOnly]>;
// Same operands as s_sendmsg; the halting variant.
def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [ImmArg<0>, IntrInaccessibleMemOnly]>;

// Workgroup-level execution barrier.
def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrConvergent]>;

// Compiler-only barrier: prevents scheduling across it within a wave.
def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrConvergent]>;

// Wait on outstanding memory counters; operand is the s_waitcnt immediate.
def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]>;
214
def int_amdgcn_div_scale : Intrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Constant to select between first and
  //                second. (0 = first, 1 = second).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<2>]
>;

// Fused multiply-add step of the division expansion; the i1 operand is the
// scale flag produced by div_scale.
def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Final fixup step of the division expansion.
def int_amdgcn_div_fixup : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// Pre-op for trig argument reduction; second operand is an i32 segment select.
def int_amdgcn_trig_preop : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
239
// Hardware sine (argument scaling follows the ISA, not libm).
def int_amdgcn_sin : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// Hardware cosine.
def int_amdgcn_cos : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Clamped base-2 logarithm.
def int_amdgcn_log_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Legacy (pre-IEEE) multiply semantics: 0 * x = 0 for any x.
def int_amdgcn_fmul_legacy : GCCBuiltin<"__builtin_amdgcn_fmul_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Hardware reciprocal.
def int_amdgcn_rcp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Legacy-semantics reciprocal (f32 only).
def int_amdgcn_rcp_legacy : GCCBuiltin<"__builtin_amdgcn_rcp_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Hardware reciprocal square root.
def int_amdgcn_rsq :  Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Legacy-semantics reciprocal square root (f32 only).
def int_amdgcn_rsq_legacy :  GCCBuiltin<"__builtin_amdgcn_rsq_legacy">,
  Intrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>;

// Clamped reciprocal square root.
def int_amdgcn_rsq_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;

// value * 2^exp, with an i32 exponent.
def int_amdgcn_ldexp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Mantissa of the input (frexp-style decomposition).
def int_amdgcn_frexp_mant : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Exponent of the input as an integer (frexp-style decomposition).
def int_amdgcn_frexp_exp : Intrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
>;

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;
298
// Pack two f32 into v2f16, rounding toward zero.
def int_amdgcn_cvt_pkrtz : GCCBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]
>;

// Pack two f32 into v2i16 as signed normalized values.
def int_amdgcn_cvt_pknorm_i16 :
  GCCBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]
>;

// Pack two f32 into v2i16 as unsigned normalized values.
def int_amdgcn_cvt_pknorm_u16 :
  GCCBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]
>;

// Pack two i32 into v2i16 with signed saturation.
def int_amdgcn_cvt_pk_i16 :
    GCCBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
    Intrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Pack two i32 into v2i16 with unsigned saturation.
def int_amdgcn_cvt_pk_u16 : GCCBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// Floating-point class test (v_cmp_class); the i32 operand is a class mask.
def int_amdgcn_class : Intrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
332
// Median of three floats (v_med3).
def int_amdgcn_fmed3 : GCCBuiltin<"__builtin_amdgcn_fmed3">,
  Intrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable]
>;

// Cube map face id for the given (x, y, z) direction.
def int_amdgcn_cubeid : GCCBuiltin<"__builtin_amdgcn_cubeid">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// Cube map major-axis magnitude for the given direction.
def int_amdgcn_cubema : GCCBuiltin<"__builtin_amdgcn_cubema">,
  Intrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Cube map S coordinate for the given direction.
def int_amdgcn_cubesc : GCCBuiltin<"__builtin_amdgcn_cubesc">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// Cube map T coordinate for the given direction.
def int_amdgcn_cubetc : GCCBuiltin<"__builtin_amdgcn_cubetc">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
  Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
  Intrinsic<[llvm_anyfloat_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
            [IntrNoMem, IntrSpeculatable]
>;
376
// Fields should mirror atomicrmw
// Profile for the wrapping-increment/decrement atomics: returns the previous
// value at the pointer. Ordering/scope/isVolatile must be immediates.
class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
  [llvm_anyptr_ty,
  LLVMMatchType<0>,
  llvm_i32_ty, // ordering
  llvm_i32_ty, // scope
  llvm_i1_ty], // isVolatile
  [IntrArgMemOnly, NoCapture<0>, ImmArg<2>, ImmArg<3>, ImmArg<4>], "",
  [SDNPMemOperand]
>;

def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;

// Profile for f32 atomics on an LDS (address space 3) pointer, each also
// exposed as the given clang builtin.
class AMDGPULDSF32Intrin<string clang_builtin> :
  GCCBuiltin<clang_builtin>,
  Intrinsic<[llvm_float_ty],
    [LLVMQualPointerType<llvm_float_ty, 3>,
    llvm_float_ty,
    llvm_i32_ty, // ordering
    llvm_i32_ty, // scope
    llvm_i1_ty], // isVolatile
    [IntrArgMemOnly, NoCapture<0>, ImmArg<2>, ImmArg<3>, ImmArg<4>]
>;
401
// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<llvm_i32_ty, 2>, // IntToPtr(M0)
   llvm_i32_ty, // value to add or swap
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty,  // isVolatile
   llvm_i32_ty, // ordered count index (OA index), also added to the address
                // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,  // wave release, usually set to 1
   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
  [NoCapture<0>,
   ImmArg<2>, ImmArg<3>, ImmArg<4>,
   ImmArg<5>, ImmArg<6>, ImmArg<7>
  ]
>;

// Profile for ds_append/ds_consume: returns the pre-op counter value.
class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty], // isVolatile
   [IntrConvergent, IntrArgMemOnly, NoCapture<0>, ImmArg<1>],
   "",
   [SDNPMemOperand]
>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

// LDS float atomics: add, min, max.
def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
441
442} // TargetPrefix = "amdgcn"
443
444// New-style image intrinsics
445
446//////////////////////////////////////////////////////////////////////////
447// Dimension-aware image intrinsics framework
448//////////////////////////////////////////////////////////////////////////
449
// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}

// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
// i.e. the first argument introduces the overloaded type; the rest match it.
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}
464
// Return arglist, with LLVMMatchType's references shifted by 'shift'.
// Non-LLVMMatchType entries are passed through unchanged.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}

// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           // Each 'any' type consumed so far bumps the match indices of the
           // next sublist.
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
                               !add(shift, !foldl(0, lhs, a, b,
                                                  !add(a, b.Type.isAny)))>.ret));
}
486
// Represent texture/image types / dimensionality.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names> {
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name; // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0; // DA bit in MIMG encoding

  // Coordinates followed by slice/face/fragid arguments, as floats (sample)
  // or ints (load/store/atomic).
  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  // Horizontal then vertical derivatives, one per coordinate.
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}

def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"]>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"]>;
}

// Groupings of the dimensions above, for iterating in multiclasses.
def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}
529
// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod;
  string LowerCaseMod = lcmod;

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Gradients = 0;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}

// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  // Emits the base variant plus its _o (offset) sibling.
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  // Adds the _c (z-compare) siblings on top of the offset variants.
  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  // Adds the _cl (clamp) siblings on top of the compare variants.
  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  // Derivative-taking variants are not supported by gather4.
  let Gradients = 1 in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}
582
// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overridden by subclasses
  bit IsSample = 0;
  bit IsAtomic = 0;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Gradients = 0;
  string LodClampMip = "";

  // Number of overloaded ('any') types among returns and data arguments;
  // address-argument match indices must be shifted past these.
  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  // Full address argument list: extra args, optional gradients, coordinates,
  // then the optional lod/clamp/mip trailing argument.
  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!eq(LodClampMip, ""),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  // Address args with overloads resolved to the default width (f32/i32).
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  // Address args with overloads resolved to 16-bit types (A16 mode).
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}

// Copies every profile field from 'base'; subclass and override selectively.
class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}
634
// Profile for IMAGE_SAMPLE*/IMAGE_GATHER4* style intrinsics; the sample
// variant supplies extra address args, gradients, and lod/clamp.
class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = 1;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}

// Profile for plain load/store style intrinsics; Mip adds a trailing mip arg.
class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = 0> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}

// Profile for image atomics; returns the previous value, no dmask operand.
class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = 1;
}

// Profile for IMAGE_GET_RESINFO; the only address argument is the mip level.
class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}
669
// Helper class for figuring out image intrinsic argument indexes.
// Argument order is: data, dmask, vaddr, rsrc, [samp, unorm], texfailctrl,
// cachepolicy (see AMDGPUImageDimIntrinsic below).
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !if(P_.IsAtomic, 0, 1); // atomics have no dmask
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);  // samp + unorm, samples only
  int DmaskArgIndex = NumDataArgs;
  // The trailing +1 skips the samp operand to land on unorm.
  int UnormArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, 1);
  int TexFailCtrlArgIndex = !add(NumDataArgs, NumDmaskArgs, NumVAddrArgs, NumRSrcArgs, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}
682
// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : Intrinsic<
    P_.RetTypes,        // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type),      // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),       // dmask(imm)
      P_.AddrTypes,                              // vaddr(VGPR)
      [llvm_v8i32_ty],                           // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,           // samp(SGPR)
                        llvm_i1_ty], []),        // unorm(imm)
      [llvm_i32_ty,                              // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                            // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)
     // Immediate operands (dmask/unorm/texfailctrl/cachepolicy) are tagged
     // using the indices computed by AMDGPUImageDimIntrinsicEval.
     !listconcat(props,
          !if(P_.IsAtomic, [], [ImmArg<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>]),
          !if(P_.IsSample, [ImmArg<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>], []),
          [ImmArg<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>,
           ImmArg<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>]),
      "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}

// Marker class for intrinsics with a DMask that determines the returned
// channels.
class AMDGPUImageDMaskIntrinsic;
715
716defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
717
718  //////////////////////////////////////////////////////////////////////////
719  // Load and store intrinsics
720  //////////////////////////////////////////////////////////////////////////
  // Instantiates one load/store-style intrinsic per non-MSAA dimension.
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = 0> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  // Instantiates one load/store-style intrinsic per dimension, MSAA included.
  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = 0> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }
748
  // Image loads: dmask selects the returned channels.
  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  // Mip-level loads; MSAA surfaces have no mip chain, hence NoMsaa.
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;

  // Image stores: vdata is the value written.
  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem], [SDNPMemOperand]>;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem], [SDNPMemOperand], 1>;
764
765  //////////////////////////////////////////////////////////////////////////
766  // sample and getlod intrinsics
767  //////////////////////////////////////////////////////////////////////////
  // Instantiates one sampling intrinsic per non-MSAA dimension; NoMem marks
  // pure queries (e.g. getlod) that do not touch memory.
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = 0> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  // One family of sample intrinsics per variant (_c/_o/_b/_l/_d/... combos).
  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }

  // LOD query; NoMem = 1 since it reads no image data.
  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;
788
789  //////////////////////////////////////////////////////////////////////////
790  // getresinfo intrinsics
791  //////////////////////////////////////////////////////////////////////////
792  foreach dim = AMDGPUDims.All in {
793    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
794      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
795        AMDGPUImageDMaskIntrinsic;
796  }
797
798  //////////////////////////////////////////////////////////////////////////
799  // gather4 intrinsics
800  //////////////////////////////////////////////////////////////////////////
801  foreach sample = AMDGPUSampleVariantsNoGradients in {
802    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
803      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
804          AMDGPUImageDimIntrinsic<
805              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
806              [IntrReadMem], [SDNPMemOperand]>;
807    }
808  }
809}
810
811//////////////////////////////////////////////////////////////////////////
812// atomic intrinsics
813//////////////////////////////////////////////////////////////////////////
814defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
815  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs> {
816    foreach dim = AMDGPUDims.All in {
817      def !strconcat(NAME, "_", dim.Name)
818        : AMDGPUImageDimIntrinsic<
819            AMDGPUDimAtomicProfile<opmod, dim, dataargs>,
820            [], [SDNPMemOperand]>;
821    }
822  }
823
824  multiclass AMDGPUImageDimAtomic<string opmod> {
825    defm "" : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">]>;
826  }
827
828  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
829  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
830  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
831  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
832  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
833  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
834  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
835  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
836  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
837  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
838
839  // TODO: INC/DEC are weird: they seem to have a vdata argument in hardware,
840  //       even though it clearly shouldn't be needed
841  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
842  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;
843
844  defm int_amdgcn_image_atomic_cmpswap :
845      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
846                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
847}
848
849//////////////////////////////////////////////////////////////////////////
850// Buffer intrinsics
851//////////////////////////////////////////////////////////////////////////
852
853let TargetPrefix = "amdgcn" in {
854
855defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {
856
// Legacy buffer load: combined vindex + offset addressing with separate
// glc/slc immediate bits. Superseded by the raw/struct variants below.
class AMDGPUBufferLoad : Intrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, ImmArg<3>, ImmArg<4>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;

// Scalar buffer load. Unlike the buffer loads above this is IntrNoMem and
// has no memory operand; the cache policy must be an immediate (ImmArg<2>).
def int_amdgcn_s_buffer_load : Intrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset(SGPR/imm)
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, ImmArg<2>]>,
  AMDGPURsrcIntrinsic<0>;

// Store counterpart of AMDGPUBufferLoad. vdata comes first, so the rsrc
// operand index recorded for mem-operand generation is 1.
class AMDGPUBufferStore : Intrinsic <
  [],
  [llvm_any_ty,       // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
def int_amdgcn_buffer_store : AMDGPUBufferStore;
889
890// New buffer intrinsics with separate raw and struct variants.  The raw
891// variant never has an index. The struct variant always has an index, even if
892// it is const 0. A struct intrinsic with constant 0 index is different to the
893// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
894// and swizzling changes depending on whether idxen is set in the instruction.
// These new intrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.
// Raw variant: no vindex operand; offset/soffset are kept separate because
// they behave differently in bounds checking and swizzling (see the comment
// above this class).
class AMDGPURawBufferLoad : Intrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
  [IntrReadMem, ImmArg<3>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;

// Struct variant: identical to the raw load plus a mandatory vindex operand
// (idxen is always set in the resulting instruction).
class AMDGPUStructBufferLoad : Intrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
  [IntrReadMem, ImmArg<4>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;

// Raw store: vdata first, then the same operands as the raw load.
class AMDGPURawBufferStore : Intrinsic <
  [],
  [llvm_any_ty,       // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
  [IntrWriteMem, ImmArg<4>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;

// Struct store: raw store plus the mandatory vindex operand.
class AMDGPUStructBufferStore : Intrinsic <
  [],
  [llvm_any_ty,       // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
  [IntrWriteMem, ImmArg<5>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;
944
// Raw buffer atomics: return the overloaded integer result and carry
// neither IntrReadMem nor IntrWriteMem (read-modify-write). Only the slc
// bit of the cache policy applies, and it must be an immediate.
class AMDGPURawBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<4>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
// cmpswap needs its own shape: two data operands (src, cmp), so rsrc moves
// to operand index 2 and the cache-policy immediate to index 5.
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<5>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// Struct buffer atomics: same as the raw variants plus the vindex operand.
class AMDGPUStructBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<5>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
// Struct cmpswap: two data operands plus vindex.
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<6>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
1006
// Obsolescent tbuffer intrinsics. These keep dfmt/nfmt as two separate
// immediates and glc/slc as i1 immediates; the replacements below merge
// them into single format/cachepolicy words.
def int_amdgcn_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // voffset(VGPR)
     llvm_i32_ty,     // soffset(SGPR)
     llvm_i32_ty,     // offset(imm)
     llvm_i32_ty,     // dfmt(imm)
     llvm_i32_ty,     // nfmt(imm)
     llvm_i1_ty,     // glc(imm)
     llvm_i1_ty],    // slc(imm)
    [IntrReadMem, ImmArg<4>, ImmArg<5>, ImmArg<6>,
     ImmArg<7>, ImmArg<8>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // voffset(VGPR)
     llvm_i32_ty,    // soffset(SGPR)
     llvm_i32_ty,    // offset(imm)
     llvm_i32_ty,    // dfmt(imm)
     llvm_i32_ty,    // nfmt(imm)
     llvm_i1_ty,     // glc(imm)
     llvm_i1_ty],    // slc(imm)
    [IntrWriteMem, ImmArg<5>, ImmArg<6>, ImmArg<7>,
     ImmArg<8>, ImmArg<9>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
1038
// New tbuffer intrinsics, with:
// - raw and struct variants (struct = mandatory vindex, as for the buffer
//   intrinsics above)
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
    [IntrReadMem, ImmArg<3>, ImmArg<4>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_raw_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
    [IntrWriteMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
    [IntrReadMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
    [IntrWriteMem, ImmArg<5>, ImmArg<6>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
1086
// Legacy buffer atomics using combined vindex + offset addressing; only the
// slc bit is exposed and it must be an immediate. No IntrReadMem/
// IntrWriteMem (read-modify-write).
class AMDGPUBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,       // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<4>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
// cmpswap: fixed i32 (not overloaded like the class above) with two data
// operands, so rsrc is at operand index 2.
def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  [llvm_i32_ty],
  [llvm_i32_ty,       // src(VGPR)
   llvm_i32_ty,       // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<5>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
1116
1117} // defset AMDGPUBufferIntrinsics
1118
// Export (EXP) instruction. Uses that do not set the done bit should set
// IntrWriteMem on the call site. tgt/en/done/vm must all be immediates.
def int_amdgcn_exp : Intrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i1_ty         // vm
  ],
  [ImmArg<0>, ImmArg<1>, ImmArg<6>, ImmArg<7>, IntrInaccessibleMemOnly]
>;

// exp with compr bit set: two packed 16-bit vector sources instead of four
// 32-bit scalars.
def int_amdgcn_exp_compr : Intrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_anyvector_ty, // src0 (v2f16 or v2i16)
  LLVMMatchType<0>,  // src1
  llvm_i1_ty,        // done
  llvm_i1_ty],       // vm
  [ImmArg<0>, ImmArg<1>, ImmArg<4>, ImmArg<5>, IntrInaccessibleMemOnly]
>;
1144
// Cache write-back/invalidate and timing scalar instructions. The empty
// property lists mean these are conservatively treated as having arbitrary
// side effects.
def int_amdgcn_buffer_wbinvl1_sc :
  GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
  Intrinsic<[], [], []>;

def int_amdgcn_buffer_wbinvl1 :
  GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
  Intrinsic<[], [], []>;

def int_amdgcn_s_dcache_inv :
  GCCBuiltin<"__builtin_amdgcn_s_dcache_inv">,
  Intrinsic<[], [], []>;

// Returns a 64-bit counter value (see also int_amdgcn_s_memrealtime below).
def int_amdgcn_s_memtime :
  GCCBuiltin<"__builtin_amdgcn_s_memtime">,
  Intrinsic<[llvm_i64_ty], []>;
1160
// Wave sleep hint; the cycle-count operand must be an immediate (ImmArg<0>).
// Note: the trailing empty record bodies ("{ }") the original defs carried
// were redundant and inconsistent with every other def in this file; they
// are dropped here ("...>;" is equivalent TableGen).
def int_amdgcn_s_sleep :
  GCCBuiltin<"__builtin_amdgcn_s_sleep">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]>;

// Raise the performance-counter level; operand is an immediate.
def int_amdgcn_s_incperflevel :
  GCCBuiltin<"__builtin_amdgcn_s_incperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]>;

// Lower the performance-counter level; operand is an immediate.
def int_amdgcn_s_decperflevel :
  GCCBuiltin<"__builtin_amdgcn_s_decperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<0>]>;
1175
// Read a hardware register selected by an immediate operand. Modeled as an
// inaccessible-memory read so it is not speculated past writes to the same
// state, yet still IntrSpeculatable/IntrReadMem for CSE purposes.
def int_amdgcn_s_getreg :
  GCCBuiltin<"__builtin_amdgcn_s_getreg">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
  [IntrInaccessibleMemOnly, IntrReadMem, IntrSpeculatable, ImmArg<0>]
>;

// int_amdgcn_s_getpc is provided to allow a specific style of position
// independent code to determine the high part of its address when it is
// known (through convention) that the code and any data of interest does
// not cross a 4Gb address boundary. Use for any other purpose may not
// produce the desired results as optimizations may cause code movement,
// especially as we explicitly use IntrNoMem to allow optimizations.
def int_amdgcn_s_getpc :
  GCCBuiltin<"__builtin_amdgcn_s_getpc">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;
1191
// Parameter-interpolation intrinsics. attr_chan/attr (and high for the f16
// forms) must be immediates; the m0 operand is passed last.

// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
// param values: 0 = P10, 1 = P20, 2 = P0
def int_amdgcn_interp_mov :
  GCCBuiltin<"__builtin_amdgcn_interp_mov">,
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, ImmArg<1>, ImmArg<2>]>;

// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
// This intrinsic reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_interp_p1 :
  GCCBuiltin<"__builtin_amdgcn_interp_p1">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, ImmArg<1>, ImmArg<2>]>;

// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
def int_amdgcn_interp_p2 :
  GCCBuiltin<"__builtin_amdgcn_interp_p2">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, ImmArg<2>, ImmArg<3>]>;
          // See int_amdgcn_interp_p1 for why this is IntrNoMem.

// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
def int_amdgcn_interp_p1_f16 :
  GCCBuiltin<"__builtin_amdgcn_interp_p1_f16">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, ImmArg<1>, ImmArg<2>, ImmArg<3>]>;

// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
def int_amdgcn_interp_p2_f16 :
  GCCBuiltin<"__builtin_amdgcn_interp_p2_f16">,
  Intrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, ImmArg<2>, ImmArg<3>, ImmArg<4>]>;
1230
// Pixel shaders only: whether the current pixel is live (i.e. not a helper
// invocation for derivative computation).
def int_amdgcn_ps_live : Intrinsic <
  [llvm_i1_ty],
  [],
  [IntrNoMem]>;

// Masked bit-count of lanes below the current one (lo/hi halves of the
// exec-style mask are passed as the two i32 operands).
def int_amdgcn_mbcnt_lo :
  GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;

def int_amdgcn_mbcnt_hi :
  GCCBuiltin<"__builtin_amdgcn_mbcnt_hi">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;

// llvm.amdgcn.ds.swizzle src offset
// Cross-lane swizzle; the offset pattern must be an immediate, and the op
// is convergent since it exchanges data between lanes.
def int_amdgcn_ds_swizzle :
  GCCBuiltin<"__builtin_amdgcn_ds_swizzle">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, ImmArg<1>]>;
1251
// Unsigned bitfield extract: (src, offset, width); pure.
def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// Signed bitfield extract: (src, offset, width); pure.
def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// v_lerp: per-byte linear interpolation of the two i32 operands.
def int_amdgcn_lerp :
  GCCBuiltin<"__builtin_amdgcn_lerp">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
1267
// Sum-of-absolute-differences family and packed conversions. All are pure
// (IntrNoMem, IntrSpeculatable) three-operand instructions; the third
// operand is the accumulator.
def int_amdgcn_sad_u8 :
  GCCBuiltin<"__builtin_amdgcn_sad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_msad_u8 :
  GCCBuiltin<"__builtin_amdgcn_msad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_hi_u8 :
  GCCBuiltin<"__builtin_amdgcn_sad_hi_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_u16 :
  GCCBuiltin<"__builtin_amdgcn_sad_u16">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_qsad_pk_u16_u8 :
  GCCBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mqsad_pk_u16_u8 :
  GCCBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mqsad_u32_u8 :
  GCCBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
  Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// Pack an f32 into one byte lane of the i32 accumulator (lane selected by
// the second operand).
def int_amdgcn_cvt_pk_u8_f32 :
  GCCBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
  Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
1315
// Wave-wide compares: evaluate the predicate (third operand, immediate
// condition code) per lane and return the result as a lane mask in the
// overloaded integer result. Convergent: the result depends on which lanes
// are active.
def int_amdgcn_icmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, ImmArg<2>]>;

def int_amdgcn_fcmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, ImmArg<2>]>;

// Broadcast the value from the first active lane to all lanes.
def int_amdgcn_readfirstlane :
  GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
  GCCBuiltin<"__builtin_amdgcn_readlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
1333
// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
  GCCBuiltin<"__builtin_amdgcn_writelane">,
  Intrinsic<[llvm_i32_ty], [
    llvm_i32_ty,    // uniform value to write: returned by the selected lane
    llvm_i32_ty,    // uniform lane select
    llvm_i32_ty     // returned by all lanes other than the selected one
  ],
  [IntrNoMem, IntrConvergent]
>;

// Funnel-style alignment ops on a 64-bit value formed from the first two
// operands; the third operand selects the bit (alignbit) or byte
// (alignbyte) shift. Pure.
def int_amdgcn_alignbit :
  GCCBuiltin<"__builtin_amdgcn_alignbit">, Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_alignbyte : GCCBuiltin<"__builtin_amdgcn_alignbyte">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
1357
// 24-bit multiplies (signed / unsigned); pure.
def int_amdgcn_mul_i24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;
1367
// GWS (global wave sync) intrinsics. All are convergent and modeled as
// inaccessible-memory operations with a memory operand so they are not
// reordered or deleted.

// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
// barrier, minus 1. Unlike the other GWS ops, init is additionally
// IntrWriteMem.
def int_amdgcn_ds_gws_init :
  GCCBuiltin<"__builtin_amdgcn_ds_gws_init">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrWriteMem, IntrInaccessibleMemOnly], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id)
// vsrc0 appears to carry the wave count minus 1 as in ds.gws.init --
// NOTE(review): the operand is named vsrc0 here but described as bar_val;
// confirm the intended semantics.
def int_amdgcn_ds_gws_barrier :
  GCCBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.v(i32 resource_id)
def int_amdgcn_ds_gws_sema_v :
  GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_v">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id)
def int_amdgcn_ds_gws_sema_br :
  GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_br">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.p(i32 resource_id)
def int_amdgcn_ds_gws_sema_p :
  GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_p">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id)
def int_amdgcn_ds_gws_sema_release_all :
  GCCBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly], "",
  [SDNPMemOperand]
>;
1426
1427
// Copies the source value to the destination value, with the guarantee that
// the source value is computed as if the entire program were executed in WQM.
def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// Return true if at least one thread within the pixel quad passes true into
// the function.
def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
  [llvm_i1_ty], [IntrNoMem, IntrConvergent]
>;

// If false, set EXEC=0 for the current thread until the end of program.
// (Empty property list: this is a side-effecting terminator-like op.)
def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;

// Copies the active channels of the source value to the destination value,
// with the guarantee that the source value is computed as if the entire
// program were executed in Whole Wavefront Mode, i.e. with all channels
// enabled, with a few exceptions: - Phi nodes which require WWM return an
// undefined value.
def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrConvergent]
>;

// Given a value, copies it while setting all the inactive lanes to a given
// value. Note that OpenGL helper lanes are considered active, so if the
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, // value to be copied
             LLVMMatchType<0>], // value for the inactive lanes to take
            [IntrNoMem, IntrConvergent]>;
1461
1462//===----------------------------------------------------------------------===//
1463// CI+ Intrinsics
1464//===----------------------------------------------------------------------===//
1465
1466def int_amdgcn_s_dcache_inv_vol :
1467  GCCBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
1468  Intrinsic<[], [], []>;
1469
1470def int_amdgcn_buffer_wbinvl1_vol :
1471  GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
1472  Intrinsic<[], [], []>;
1473
1474//===----------------------------------------------------------------------===//
1475// VI Intrinsics
1476//===----------------------------------------------------------------------===//
1477
1478// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
1479def int_amdgcn_mov_dpp :
1480  Intrinsic<[llvm_anyint_ty],
1481            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
1482             llvm_i1_ty], [IntrNoMem, IntrConvergent, ImmArg<1>,
1483                           ImmArg<2>, ImmArg<3>, ImmArg<4>]>;
1484
1485// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
1486// Should be equivalent to:
1487// v_mov_b32 <dest> <old>
1488// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
1489def int_amdgcn_update_dpp :
1490  Intrinsic<[llvm_anyint_ty],
1491            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
1492            llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
1493             [IntrNoMem, IntrConvergent,
1494              ImmArg<2>, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
1495
// Scalar data-cache write-backs; empty property lists keep them
// conservative.
def int_amdgcn_s_dcache_wb :
  GCCBuiltin<"__builtin_amdgcn_s_dcache_wb">,
  Intrinsic<[], [], []>;

def int_amdgcn_s_dcache_wb_vol :
  GCCBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
  Intrinsic<[], [], []>;

// Real-time counter read (see also int_amdgcn_s_memtime above).
def int_amdgcn_s_memrealtime :
  GCCBuiltin<"__builtin_amdgcn_s_memrealtime">,
  Intrinsic<[llvm_i64_ty]>;

// llvm.amdgcn.ds.permute <index> <src>
// Forward (push) lane permute; convergent cross-lane data movement.
def int_amdgcn_ds_permute :
  GCCBuiltin<"__builtin_amdgcn_ds_permute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

// llvm.amdgcn.ds.bpermute <index> <src>
// Backward (pull) lane permute; convergent cross-lane data movement.
def int_amdgcn_ds_bpermute :
  GCCBuiltin<"__builtin_amdgcn_ds_bpermute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
1517
1518//===----------------------------------------------------------------------===//
1519// GFX10 Intrinsics
1520//===----------------------------------------------------------------------===//
1521
// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
// GFX10 lane permute within 16-lane groups.  The <fi> and <bound_control>
// flags must be immediates (ImmArg<4>, ImmArg<5>).
def int_amdgcn_permlane16 : GCCBuiltin<"__builtin_amdgcn_permlane16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, ImmArg<4>, ImmArg<5>]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
// As permlane16, but the "x" variant exchanges data across 16-lane groups
// (per the instruction name; confirm against the ISA manual).
def int_amdgcn_permlanex16 : GCCBuiltin<"__builtin_amdgcn_permlanex16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, ImmArg<4>, ImmArg<5>]>;
1533
// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
// <sel> is a 32-bit constant whose high 8 bits must be zero which selects
// the lanes to read from.
// GFX10 DPP8 variant: the low 24 bits of <sel> hold eight 3-bit lane
// selectors; <sel> must be an immediate (ImmArg<1>).
def int_amdgcn_mov_dpp8 :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, ImmArg<1>]>;

// Return the wave's id within its workgroup.  Modeled as a read of
// inaccessible memory (IntrReadMem + IntrInaccessibleMemOnly) so calls are
// not speculated or folded across points where the value could change.
def int_amdgcn_s_get_waveid_in_workgroup :
  GCCBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
  Intrinsic<[llvm_i32_ty], [], [IntrReadMem, IntrInaccessibleMemOnly]>;
1545
1546//===----------------------------------------------------------------------===//
1547// Deep learning intrinsics.
1548//===----------------------------------------------------------------------===//
1549
// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// Half-precision 2-element dot product accumulated into an f32.
// %clamp must be an immediate (ImmArg<3>).
def int_amdgcn_fdot2 :
  GCCBuiltin<"__builtin_amdgcn_fdot2">,
  Intrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
  >;
1564
// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// Signed 2-element i16 dot product with i32 accumulate.
// %clamp must be an immediate (ImmArg<3>).
def int_amdgcn_sdot2 :
  GCCBuiltin<"__builtin_amdgcn_sdot2">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
  >;

// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// Unsigned counterpart of sdot2; same operand layout and ImmArg constraint.
def int_amdgcn_udot2 :
  GCCBuiltin<"__builtin_amdgcn_udot2">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
  >;
1594
// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// Signed dot product of four i8 values packed into each i32 operand,
// accumulated into %c.  %clamp must be an immediate (ImmArg<3>).
def int_amdgcn_sdot4 :
  GCCBuiltin<"__builtin_amdgcn_sdot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
  >;

// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
// Unsigned counterpart of sdot4; same operand layout and ImmArg constraint.
def int_amdgcn_udot4 :
  GCCBuiltin<"__builtin_amdgcn_udot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
  >;
1624
// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
// Signed dot product of eight 4-bit values packed into each i32 operand,
// accumulated into %c.  %clamp must be an immediate (ImmArg<3>).
def int_amdgcn_sdot8 :
  GCCBuiltin<"__builtin_amdgcn_sdot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
  >;

// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
// Unsigned counterpart of sdot8; same operand layout and ImmArg constraint.
def int_amdgcn_udot8 :
  GCCBuiltin<"__builtin_amdgcn_udot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<3>]
  >;
1656
1657//===----------------------------------------------------------------------===//
1658// gfx908 intrinsics
//===----------------------------------------------------------------------===//
1660
// Buffer atomic that does not return the pre-operation value (empty result
// list).  AMDGPURsrcIntrinsic<1, 0> tags operand 1 as the resource
// descriptor (not an image) so a mem operand can be generated for it;
// SDNPMemOperand attaches that mem operand to the selected node.
class AMDGPUBufferAtomicNoRtn : Intrinsic <
  [],
  [llvm_anyfloat_ty,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
1670
// Pointer-based atomic with no returned result.  IntrArgMemOnly: memory is
// accessed only through the vaddr argument; NoCapture<0>: the pointer is
// not captured by the call.
class AMDGPUGlobalAtomicNoRtn : Intrinsic <
  [],
  [llvm_anyptr_ty,    // vaddr
   llvm_anyfloat_ty], // vdata(VGPR)
  [IntrArgMemOnly, NoCapture<0>], "", [SDNPMemOperand]>;

// gfx908 no-return floating-point atomic adds.
def int_amdgcn_buffer_atomic_fadd    : AMDGPUBufferAtomicNoRtn;
def int_amdgcn_global_atomic_fadd    : AMDGPUGlobalAtomicNoRtn;
1679
// llvm.amdgcn.mfma.f32.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
// Matrix fused-multiply-add.  The name encodes the MxNxK tile and the source
// element type; srcC is the accumulator and shares its vector type with the
// result.  The cbsz/abid/blgp modifiers (operands 3-5) must be immediates.
//
// f32-sourced variants: srcA/srcB are scalar floats.
def int_amdgcn_mfma_f32_32x32x1f32 : Intrinsic<[llvm_v32f32_ty],
  [llvm_float_ty, llvm_float_ty, llvm_v32f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x1f32 : Intrinsic<[llvm_v16f32_ty],
  [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_4x4x1f32 : Intrinsic<[llvm_v4f32_ty],
  [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_32x32x2f32 : Intrinsic<[llvm_v16f32_ty],
  [llvm_float_ty, llvm_float_ty, llvm_v16f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x4f32 : Intrinsic<[llvm_v4f32_ty],
  [llvm_float_ty, llvm_float_ty, llvm_v4f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
1705
// MFMA f16-sourced variants: srcA/srcB are packed v4f16; accumulator and
// result are f32 vectors.  Same operand layout and immediate constraints as
// the f32 variants above.
def int_amdgcn_mfma_f32_32x32x4f16 : Intrinsic<[llvm_v32f32_ty],
  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v32f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x4f16 : Intrinsic<[llvm_v16f32_ty],
  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_4x4x4f16 : Intrinsic<[llvm_v4f32_ty],
  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_32x32x8f16 : Intrinsic<[llvm_v16f32_ty],
  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v16f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x16f16 : Intrinsic<[llvm_v4f32_ty],
  [llvm_v4f16_ty, llvm_v4f16_ty, llvm_v4f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
1730
// MFMA i8-sourced variants: srcA/srcB are four i8 values packed in an i32;
// accumulator and result are i32 vectors.  Same operand layout and immediate
// constraints as the float variants above.
def int_amdgcn_mfma_i32_32x32x4i8 : Intrinsic<[llvm_v32i32_ty],
  [llvm_i32_ty, llvm_i32_ty, llvm_v32i32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_i32_16x16x4i8 : Intrinsic<[llvm_v16i32_ty],
  [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_i32_4x4x4i8 : Intrinsic<[llvm_v4i32_ty],
  [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_i32_32x32x8i8 : Intrinsic<[llvm_v16i32_ty],
  [llvm_i32_ty, llvm_i32_ty, llvm_v16i32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_i32_16x16x16i8 : Intrinsic<[llvm_v4i32_ty],
  [llvm_i32_ty, llvm_i32_ty, llvm_v4i32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
1755
// MFMA bf16-sourced variants: srcA/srcB are bf16 pairs passed as v2i16 bit
// patterns (there is no dedicated bf16 IR type here); accumulator and result
// are f32 vectors.  Same operand layout and immediate constraints as above.
def int_amdgcn_mfma_f32_32x32x2bf16 : Intrinsic<[llvm_v32f32_ty],
  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v32f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x2bf16 : Intrinsic<[llvm_v16f32_ty],
  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_4x4x2bf16 : Intrinsic<[llvm_v4f32_ty],
  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_32x32x4bf16 : Intrinsic<[llvm_v16f32_ty],
  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v16f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;

def int_amdgcn_mfma_f32_16x16x8bf16 : Intrinsic<[llvm_v4f32_ty],
  [llvm_v2i16_ty, llvm_v2i16_ty, llvm_v4f32_ty,
   llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrConvergent, IntrNoMem, ImmArg<3>, ImmArg<4>, ImmArg<5>]>;
1780
1781//===----------------------------------------------------------------------===//
1782// Special Intrinsics for backend internal use only. No frontend
1783// should emit calls to these.
//===----------------------------------------------------------------------===//
// Control-flow intrinsics used by the backend to structurize divergent
// branches.  NOTE(review): the overloaded anyint type is presumably the
// wave-mask type (i32 for wave32, i64 for wave64) -- confirm against the
// control-flow lowering passes.

// Begin a divergent if-region: takes the divergent condition, returns the
// per-wave condition plus a saved mask for the end of the region.
def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent]
>;

// Transition from the "then" to the "else" half of a divergent if-region.
def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent]
>;

// Fold a lane's loop-exit condition into the accumulated break mask.
def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, llvm_anyint_ty], [IntrNoMem, IntrConvergent]
>;

// Loop back-edge test driven by the accumulated mask.
def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent]
>;

// Close a structurized control-flow region, restoring the saved mask.
def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty], [IntrConvergent]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;

// Emit 2.5 ulp, no denormal division. Should only be inserted by
// pass based on !fpmath metadata.
def int_amdgcn_fdiv_fast : Intrinsic<
  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;
1812}
1813