//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the AMDGPU-specific intrinsics, covering both the
// "r600" and "amdgcn" target prefixes.
//
//===----------------------------------------------------------------------===//

// Reads a 32-bit value from a special register preloaded before kernel launch
// (e.g. a workitem or workgroup id component). Pure and speculatable.
class AMDGPUReadPreloadRegisterIntrinsic
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// Same as above, but additionally exposed to Clang as the builtin 'name'.
class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>,
    ClangBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
  int RsrcArg = rsrcarg; // Operand index of the resource descriptor argument.
  bit IsImage = isimage; // True for image (MIMG) intrinsics.
}

let TargetPrefix = "r600" in {

// Instantiates the _x/_y/_z component variants of a preload-register read.
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}

// Same, with a Clang builtin "<prefix>_x"/"_y"/"_z" for each component.
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}

defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;

def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;

// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
def int_r600_implicitarg_ptr :
  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrWillReturn]>,
  ClangBuiltin<"__builtin_r600_rat_store_typed">;

def int_r600_recipsqrt_ieee : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_recipsqrt_clamped : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_cube : Intrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_store_stream_output : Intrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn]
>;

// Texture fetch with floating-point coordinates (e.g. tex/txl/txb variants).
class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y
  llvm_i32_ty,   // offset_z
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem, IntrWillReturn]
>;

// Texture fetch with integer coordinates (e.g. txf/txq).
class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
  llvm_v4i32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y
  llvm_i32_ty,   // offset_z
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem, IntrWillReturn]
>;

def int_r600_store_swizzle :
  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn]
>;

def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;

def int_r600_dot4 : Intrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_kill : Intrinsic<[], [llvm_float_ty], [IntrWillReturn]>;

} // End TargetPrefix = "r600"

let TargetPrefix = "amdgcn" in {

//===----------------------------------------------------------------------===//
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;

def int_amdgcn_dispatch_ptr :
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_queue_ptr :
  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_kernarg_segment_ptr :
  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_implicitarg_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_groupstaticsize :
  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_dispatch_id :
  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_implicit_buffer_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
// FIXME: Should be mangled for wave size.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, ImmArg<ArgIndex<0>>]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic, computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wavefrontsize :
  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;


//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is s_sendmsg immediate (i16),
// the second one is copied to m0
def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;


// gfx11 intrinsic
// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64.
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn]>;

def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_div_scale : Intrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Select quotient. Must equal Numerator or Denominator.
  //                (0 = Denominator, 1 = Numerator).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>, IntrWillReturn]
>;

def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_div_fixup : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Look Up 2.0 / pi src0 with segment select src1[4:0]
def int_amdgcn_trig_preop : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sin : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cos : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_log_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
>;

// Fused single-precision multiply-add with legacy behaviour for the multiply,
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely different kind of legacy behaviour.)
def int_amdgcn_fma_legacy :
  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
>;

def int_amdgcn_rcp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sqrt : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rsq : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  Intrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def int_amdgcn_rsq_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_ldexp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_frexp_mant : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_frexp_exp : Intrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pknorm_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pknorm_u16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_i16 :
    ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
    Intrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_class : Intrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_fmed3 : ClangBuiltin<"__builtin_amdgcn_fmed3">,
  Intrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
  Intrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
  Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
  Intrinsic<[llvm_anyfloat_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Fields should mirror atomicrmw
class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
  [llvm_anyptr_ty,
   LLVMMatchType<0>,
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty], // isVolatile
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "",
  [SDNPMemOperand]
>;

def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;

// Atomic read-modify-write on an LDS (address space 3) location; the ordering,
// scope and volatility operands mirror atomicrmw, as above.
class AMDGPULDSIntrin :
  Intrinsic<[llvm_any_ty],
    [LLVMQualPointerType<LLVMMatchType<0>, 3>,
     LLVMMatchType<0>,
     llvm_i32_ty, // ordering
     llvm_i32_ty, // scope
     llvm_i1_ty], // isVolatile
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]
>;

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<llvm_i32_ty, 2>, // IntToPtr(M0)
   llvm_i32_ty, // value to add or swap
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty,  // isVolatile
   llvm_i32_ty, // ordered count index (OA index), also added to the address
                // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,  // wave release, usually set to 1
   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
  [IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>
  ]
>;

class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty],    // isVolatile
  [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
   NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>],
  "",
  [SDNPMemOperand]
>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

} // TargetPrefix = "amdgcn"

// New-style image intrinsics

//////////////////////////////////////////////////////////////////////////
// Dimension-aware image intrinsics framework
//////////////////////////////////////////////////////////////////////////

// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}

// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}

// Return arglist, with LLVMMatchType's references shifted by 'shift'.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}

// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
               !add(shift, !foldl(0, lhs, a, b,
                                  !add(a, b.Type.isAny)))>.ret));
}

// Represent texture/image types / dimensionality.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names,
                     bit msaa = 0> {
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name; // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0; // DA bit in MIMG encoding
  bit MSAA = msaa;

  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}

def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
}

// Groupings of the dimensions above, used to drive the foreach loops below.
def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}

// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod; // Instruction-name suffix, e.g. "_C_O".
  string LowerCaseMod = lcmod; // Intrinsic-name suffix, e.g. "_c_o".

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}

// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  // Emits the base variant plus its offset (_o) form.
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  // Adds the z-compare (_c) forms on top of the offset forms.
  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  // Adds the clamp (_cl) forms on top of the compare/offset forms.
  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}

// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}

// Copies every overridable field from an existing profile.
class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}

// Profile for IMAGE_SAMPLE-style intrinsics, parameterized by sample variant.
class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}

// Profile for load/store-style intrinsics (optionally with a mip argument).
class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}

// Profile for integer image atomics.
class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

// Profile for floating-point image atomics (e.g. fmin/fmax).
class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}

// Profile for IMAGE_GET_RESINFO; takes only a mip level as its address.
class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}

// Helper class for figuring out image intrinsic argument indexes.
// Computes, from a profile, the count and index of each argument group in the
// final intrinsic signature (data, dmask, vaddr, rsrc, samp, unorm, ...).
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !not(P_.IsAtomic); // Atomics have no dmask operand.
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0); // Sampler descriptor + unorm flag.
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1); // Last vaddr slot.
  int MipArgIndex = LodArgIndex;
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}

// All dimension-aware intrinsics are derived from this class.
770class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_, 771 list<IntrinsicProperty> props, 772 list<SDNodeProperty> sdnodeprops> : Intrinsic< 773 P_.RetTypes, // vdata(VGPR) -- for load/atomic-with-return 774 !listconcat( 775 !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic 776 !if(P_.IsAtomic, [], [llvm_i32_ty]), // dmask(imm) 777 P_.AddrTypes, // vaddr(VGPR) 778 [llvm_v8i32_ty], // rsrc(SGPR) 779 !if(P_.IsSample, [llvm_v4i32_ty, // samp(SGPR) 780 llvm_i1_ty], []), // unorm(imm) 781 [llvm_i32_ty, // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe) 782 llvm_i32_ty]), // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc) 783 784 !listconcat(props, 785 !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]), 786 !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []), 787 [IntrWillReturn], 788 [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>, 789 ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]), 790 791 792 "", sdnodeprops>, 793 AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes), 794 !if(P_.IsAtomic, 0, 1)), 1> { 795 AMDGPUDimProfile P = P_; 796 797 AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME); 798 799 let TargetPrefix = "amdgcn"; 800} 801 802// Marker class for intrinsics with a DMask that determines the returned 803// channels. 
class AMDGPUImageDMaskIntrinsic;

defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

  //////////////////////////////////////////////////////////////////////////
  // Load and store intrinsics
  //////////////////////////////////////////////////////////////////////////
  // Instantiates one intrinsic per non-MSAA dimension; Mip=1 adds a mip
  // level to the address operands via the profile.
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  // Same as above, but covers every dimension (including MSAA ones).
  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = false> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;

  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;

  //////////////////////////////////////////////////////////////////////////
  // MSAA intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  //////////////////////////////////////////////////////////////////////////
  // sample and getlod intrinsics
  //////////////////////////////////////////////////////////////////////////
  // NoMem=1 (used by getlod) makes the intrinsic IntrNoMem and drops the
  // SDNPMemOperand, since no image memory is actually read.
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }

  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }

  //////////////////////////////////////////////////////////////////////////
  // gather4 intrinsics
  //////////////////////////////////////////////////////////////////////////
  // gather4 only exists for 2D-like dimensions.
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
}

//////////////////////////////////////////////////////////////////////////
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  // Instantiates one atomic image intrinsic per dimension, selecting the
  // float or integer profile based on isFloat.
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
      foreach dim = AMDGPUDims.All in {
        def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
            !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
                 AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
            [], [SDNPMemOperand]>;
      }
  }

  // Common single-operand atomic (vdata matches the overloaded return type).
  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;

  // cmpswap takes two data operands: the new value and the comparand.
  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
}

//////////////////////////////////////////////////////////////////////////
// Buffer intrinsics
//////////////////////////////////////////////////////////////////////////

let TargetPrefix = "amdgcn" in {

defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

// Legacy buffer load; the raw/struct variants below are preferred.
class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, IntrWillReturn,
   ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;

// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : Intrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;

// Legacy buffer store; the raw/struct variants below are preferred.
class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, IntrWillReturn,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;

// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different to the
// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
// and swizzling changes depending on whether idxen is set in the instruction.
// These new intrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem, IntrWillReturn, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;

class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem, IntrWillReturn, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;

class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;

class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;

// NoRtn=true produces the no-return form: vdata becomes the overloaded type
// and nothing is returned.
class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;

class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<6>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;


// Obsolescent tbuffer intrinsics.
def int_amdgcn_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // voffset(VGPR)
     llvm_i32_ty,     // soffset(SGPR)
     llvm_i32_ty,     // offset(imm)
     llvm_i32_ty,     // dfmt(imm)
     llvm_i32_ty,     // nfmt(imm)
     llvm_i1_ty,      // glc(imm)
     llvm_i1_ty],     // slc(imm)
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
     ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // voffset(VGPR)
     llvm_i32_ty,    // soffset(SGPR)
     llvm_i32_ty,    // offset(imm)
     llvm_i32_ty,    // dfmt(imm)
     llvm_i32_ty,    // nfmt(imm)
     llvm_i1_ty,     // glc(imm)
     llvm_i1_ty],    // slc(imm)
    [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<5>>,
     ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
     ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_raw_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,     // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
    [IntrWriteMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,     // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
    [IntrWriteMem, IntrWillReturn,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

// Legacy buffer atomics; the raw/struct variants above are preferred.
class AMDGPUBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  [llvm_i32_ty],
  [llvm_i32_ty,       // src(VGPR)
   llvm_i32_ty,       // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;

class AMDGPUBufferAtomicFP : Intrinsic <
  [llvm_anyfloat_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;

// Loads buffer data directly into LDS (no return value); the LDS destination
// pointer is operand 1.
class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                       //                                   bit 1 = slc,
                                       //                                   bit 2 = dlc on gfx10+))
                                       //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;

class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // vindex(VGPR)
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                       //                                   bit 1 = slc,
                                       //                                   bit 2 = dlc on gfx10+))
                                       //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;

} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
// call site.
def int_amdgcn_exp : Intrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i1_ty         // vm (ignored on GFX11+)
  ],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly,
   IntrWillReturn]
>;

// exp with row_en bit set. Only supported on GFX11+.
def int_amdgcn_exp_row : Intrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i32_ty],      // row number
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   IntrWriteMem, IntrInaccessibleMemOnly, IntrWillReturn]
>;

// exp with compr bit set. Not supported on GFX11+.
def int_amdgcn_exp_compr : Intrinsic <[], [
  llvm_i32_ty,        // tgt,
  llvm_i32_ty,        // en
  llvm_anyvector_ty,  // src0 (v2f16 or v2i16)
  LLVMMatchType<0>,   // src1
  llvm_i1_ty,         // done
  llvm_i1_ty],        // vm
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly,
   IntrWillReturn]
>;

def int_amdgcn_buffer_wbinvl1_sc :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_buffer_wbinvl1 :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_dcache_inv :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_memtime :
  ClangBuiltin<"__builtin_amdgcn_s_memtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_sleep :
  ClangBuiltin<"__builtin_amdgcn_s_sleep">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]> {
}

def int_amdgcn_s_incperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]> {
}

def int_amdgcn_s_decperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_decperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]> {
}

def int_amdgcn_s_sethalt :
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_setprio :
  ClangBuiltin<"__builtin_amdgcn_s_setprio">,
  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

// This is IntrHasSideEffects so it can be used to read cycle counters.
def int_amdgcn_s_getreg :
  ClangBuiltin<"__builtin_amdgcn_s_getreg">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg<ArgIndex<0>>]
>;

// Note this can be used to set FP environment properties that are
// unsafe to change in non-strictfp functions. The register properties
// available (and value required to access them) may differ per
// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
def int_amdgcn_s_setreg :
  ClangBuiltin<"__builtin_amdgcn_s_setreg">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg<ArgIndex<0>>]
>;

// int_amdgcn_s_getpc is provided to allow a specific style of position
// independent code to determine the high part of its address when it is
// known (through convention) that the code and any data of interest does
// not cross a 4Gb address boundary. Use for any other purpose may not
// produce the desired results as optimizations may cause code movement,
// especially as we explicitly use IntrNoMem to allow optimizations.
def int_amdgcn_s_getpc :
  ClangBuiltin<"__builtin_amdgcn_s_getpc">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
                                IntrWillReturn]>;

// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
// param values: 0 = P10, 1 = P20, 2 = P0
def int_amdgcn_interp_mov :
  ClangBuiltin<"__builtin_amdgcn_interp_mov">,
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
// This intrinsic reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_interp_p1 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
def int_amdgcn_interp_p2 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
          // See int_amdgcn_v_interp_p1 for why this is IntrNoMem.
// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p1_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p2_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">,
  Intrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.lds.direct.load <m0>
// The input argument is m0, which contains a packed combination of address
// offset and flags describing the data type.
def int_amdgcn_lds_direct_load :
  Intrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16
            [llvm_i32_ty],
            [IntrReadMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0>
// Like interp intrinsics, this reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_lds_param_load :
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0>
def int_amdgcn_interp_inreg_p10 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
def int_amdgcn_interp_inreg_p2 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
// high selects whether high or low 16-bits are used for p and p0 operands
def int_amdgcn_interp_inreg_p10_f16:
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high>
// high selects whether high or low 16-bits are used for p operand
def int_amdgcn_interp_inreg_p2_f16 :
  Intrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<3>>]>;

// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : Intrinsic <
  [llvm_i1_ty],
  [],
  [IntrNoMem, IntrWillReturn]>;

// Query currently live lanes.
// Returns true if lane is live (and not a helper lane).
def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty],
  [], [IntrReadMem, IntrInaccessibleMemOnly, IntrWillReturn]
>;

def int_amdgcn_mbcnt_lo :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn]>;

def int_amdgcn_mbcnt_hi :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn]>;

// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
  ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>]>;

// Unsigned bitfield extract: operands are value, offset, width.
def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Signed bitfield extract: operands are value, offset, width.
def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_lerp :
  ClangBuiltin<"__builtin_amdgcn_lerp">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_msad_u8 :
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_hi_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_u16 :
  ClangBuiltin<"__builtin_amdgcn_sad_u16">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_qsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mqsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mqsad_u32_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
  Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_u8_f32 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
  Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Last operand is the comparison predicate code (imm).
def int_amdgcn_icmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>]>;

// Last operand is the comparison predicate code (imm).
def int_amdgcn_fcmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>]>;

def int_amdgcn_ballot :
  Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_readfirstlane :
  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
1651def int_amdgcn_readlane : 1652 ClangBuiltin<"__builtin_amdgcn_readlane">, 1653 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 1654 [IntrNoMem, IntrConvergent, IntrWillReturn]>; 1655 1656// The value to write and lane select arguments must be uniform across the 1657// currently active threads of the current wave. Otherwise, the result is 1658// undefined. 1659def int_amdgcn_writelane : 1660 ClangBuiltin<"__builtin_amdgcn_writelane">, 1661 Intrinsic<[llvm_i32_ty], [ 1662 llvm_i32_ty, // uniform value to write: returned by the selected lane 1663 llvm_i32_ty, // uniform lane select 1664 llvm_i32_ty // returned by all lanes other than the selected one 1665 ], 1666 [IntrNoMem, IntrConvergent, IntrWillReturn] 1667>; 1668 1669def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">, 1670 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1671 [IntrNoMem, IntrSpeculatable, IntrWillReturn] 1672>; 1673 1674def int_amdgcn_mul_i24 : Intrinsic<[llvm_i32_ty], 1675 [llvm_i32_ty, llvm_i32_ty], 1676 [IntrNoMem, IntrSpeculatable, IntrWillReturn] 1677>; 1678 1679def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty], 1680 [llvm_i32_ty, llvm_i32_ty], 1681 [IntrNoMem, IntrSpeculatable, IntrWillReturn] 1682>; 1683 1684def int_amdgcn_mulhi_i24 : Intrinsic<[llvm_i32_ty], 1685 [llvm_i32_ty, llvm_i32_ty], 1686 [IntrNoMem, IntrSpeculatable, IntrWillReturn] 1687>; 1688 1689def int_amdgcn_mulhi_u24 : Intrinsic<[llvm_i32_ty], 1690 [llvm_i32_ty, llvm_i32_ty], 1691 [IntrNoMem, IntrSpeculatable, IntrWillReturn] 1692>; 1693 1694// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id) 1695// 1696// bar_val is the total number of waves that will wait on this 1697// barrier, minus 1. 
1698def int_amdgcn_ds_gws_init : 1699 ClangBuiltin<"__builtin_amdgcn_ds_gws_init">, 1700 Intrinsic<[], 1701 [llvm_i32_ty, llvm_i32_ty], 1702 [IntrConvergent, IntrWriteMem, 1703 IntrInaccessibleMemOnly, IntrWillReturn], "", 1704 [SDNPMemOperand] 1705>; 1706 1707// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id) 1708// bar_val is the total number of waves that will wait on this 1709// barrier, minus 1. 1710def int_amdgcn_ds_gws_barrier : 1711 ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">, 1712 Intrinsic<[], 1713 [llvm_i32_ty, llvm_i32_ty], 1714 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", 1715 [SDNPMemOperand] 1716>; 1717 1718// llvm.amdgcn.ds.gws.sema.v(i32 resource_id) 1719def int_amdgcn_ds_gws_sema_v : 1720 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, 1721 Intrinsic<[], 1722 [llvm_i32_ty], 1723 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", 1724 [SDNPMemOperand] 1725>; 1726 1727// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) 1728def int_amdgcn_ds_gws_sema_br : 1729 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, 1730 Intrinsic<[], 1731 [llvm_i32_ty, llvm_i32_ty], 1732 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", 1733 [SDNPMemOperand] 1734>; 1735 1736// llvm.amdgcn.ds.gws.sema.p(i32 resource_id) 1737def int_amdgcn_ds_gws_sema_p : 1738 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, 1739 Intrinsic<[], 1740 [llvm_i32_ty], 1741 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", 1742 [SDNPMemOperand] 1743>; 1744 1745// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) 1746def int_amdgcn_ds_gws_sema_release_all : 1747 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, 1748 Intrinsic<[], 1749 [llvm_i32_ty], 1750 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "", 1751 [SDNPMemOperand] 1752>; 1753 1754 1755// Copies the source value to the destination value, with the guarantee that 1756// the source value is computed as if the entire program were executed in WQM. 
1757def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], 1758 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn] 1759>; 1760 1761// Copies the source value to the destination value, such that the source 1762// is computed as if the entire program were executed in WQM if any other 1763// program code executes in WQM. 1764def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], 1765 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn] 1766>; 1767 1768// Return true if at least one thread within the pixel quad passes true into 1769// the function. 1770def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], 1771 [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn] 1772>; 1773 1774// If false, set EXEC=0 for the current thread until the end of program. 1775// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? 1776def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>; 1777 1778def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">, 1779 Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects] 1780>; 1781 1782// If false, mark all active lanes as helper lanes until the end of program. 1783def int_amdgcn_wqm_demote : Intrinsic<[], 1784 [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly] 1785>; 1786 1787// Copies the active channels of the source value to the destination value, 1788// with the guarantee that the source value is computed as if the entire 1789// program were executed in Whole Wavefront Mode, i.e. with all channels 1790// enabled, with a few exceptions: - Phi nodes which require WWM return an 1791// undefined value. 1792def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty], 1793 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 1794 IntrConvergent, IntrWillReturn] 1795>; 1796// Deprecated. Use int_amdgcn_strict_wwm instead. 
1797def int_amdgcn_wwm : Intrinsic<[llvm_any_ty], 1798 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 1799 IntrConvergent, IntrWillReturn] 1800>; 1801def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty], 1802 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 1803 IntrConvergent, IntrWillReturn] 1804>; 1805 1806// Given a value, copies it while setting all the inactive lanes to a given 1807// value. Note that OpenGL helper lanes are considered active, so if the 1808// program ever uses WQM, then the instruction and the first source will be 1809// computed in WQM. 1810def int_amdgcn_set_inactive : 1811 Intrinsic<[llvm_anyint_ty], 1812 [LLVMMatchType<0>, // value to be copied 1813 LLVMMatchType<0>], // value for the inactive lanes to take 1814 [IntrNoMem, IntrConvergent, IntrWillReturn]>; 1815 1816// Return if the given flat pointer points to a local memory address. 1817def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">, 1818 Intrinsic<[llvm_i1_ty], [llvm_ptr_ty], 1819 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>, IntrWillReturn] 1820>; 1821 1822// Return if the given flat pointer points to a prvate memory address. 
def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">,
  Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>, IntrWillReturn]
>;

//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_s_dcache_inv_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_buffer_wbinvl1_vol :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// VI Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// All DPP control operands must be immediates.
def int_amdgcn_mov_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
             llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// Should be equivalent to:
// v_mov_b32 <dest> <old>
// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_update_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_s_dcache_wb :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_dcache_wb_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_memrealtime :
  ClangBuiltin<"__builtin_amdgcn_s_memrealtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

// llvm.amdgcn.ds.permute <index> <src>
def int_amdgcn_ds_permute :
  ClangBuiltin<"__builtin_amdgcn_ds_permute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// llvm.amdgcn.ds.bpermute <index> <src>
def int_amdgcn_ds_bpermute :
  ClangBuiltin<"__builtin_amdgcn_ds_bpermute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
     [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// llvm.amdgcn.perm <src0> <src1> <selector>
def int_amdgcn_perm :
  ClangBuiltin<"__builtin_amdgcn_perm">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
     [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//

// Direct global-memory -> LDS copy. Operands 2-4 must be immediates.
// (Fixed: the property list previously contained a duplicated
// ImmArg<ArgIndex<2>> entry.)
class AMDGPUGlobalLoadLDS : Intrinsic <
  [],
  [LLVMQualPointerType<llvm_i8_ty, 1>, // Base global pointer to load from
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base pointer to store to
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // imm offset (applied to both global and LDS address)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy:
                                       //   bit 0 = glc/sc0,
                                       //   bit 1 = slc/sc1,
                                       //   bit 2 = dlc on gfx10+,
                                       //   bit 4 = scc/nt on gfx90a+)
  [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>],
  "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;

//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
// <fi> and <bound_control> must be immediates.
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
// <sel> is a 32-bit constant whose high 8 bits must be zero which selects
// the lanes to read from.
def int_amdgcn_mov_dpp8 :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>]>;

def int_amdgcn_s_get_waveid_in_workgroup :
  ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
  Intrinsic<[llvm_i32_ty], [],
    [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

// Shared shape for global/flat atomics that return the pre-op value.
class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
  [vt],
  [llvm_anyptr_ty,    // vaddr
   vt],               // vdata(VGPR)
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>], "",
  [SDNPMemOperand]>;

def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;

// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
//                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
def int_amdgcn_image_bvh_intersect_ray :
  Intrinsic<[llvm_v4i32_ty],
            [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
             LLVMMatchType<1>, llvm_v4i32_ty],
            [IntrReadMem, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// GFX11 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// GS register add/sub with returned result; operand 1 (register offset)
// must be an immediate.
def int_amdgcn_ds_add_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_ds_sub_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C .

class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

// Variant with an extra immediate %high operand (opsel).
class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %high
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<3>>]
>;

// Integer variant with per-operand sign flags and clamp, all immediates.
class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      llvm_i1_ty,       // %A_sign
      AB,               // %A
      llvm_i1_ty,       // %B_sign
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %clamp
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
>;

def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;


//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//

// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2 :
  ClangBuiltin<"__builtin_amdgcn_fdot2">,
  Intrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
  Intrinsic<
    [llvm_half_ty],  // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_half_ty   // %c
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
  >;

// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// bf16 operands are modeled as i16 since LLVM has no bf16 type here.
def int_amdgcn_fdot2_bf16_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
  Intrinsic<
    [llvm_i16_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i16_ty    // %c
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
  >;

// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f32_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
  Intrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_sdot2 :
  ClangBuiltin<"__builtin_amdgcn_sdot2">,
  Intrinsic<
    [llvm_i32_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_udot2 :
  ClangBuiltin<"__builtin_amdgcn_udot2">,
  Intrinsic<
    [llvm_i32_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sdot4 :
  ClangBuiltin<"__builtin_amdgcn_sdot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_udot4 :
  ClangBuiltin<"__builtin_amdgcn_udot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0. . . 3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
// b[i in 0. . . 3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sudot4 :
  ClangBuiltin<"__builtin_amdgcn_sudot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sdot8 :
  ClangBuiltin<"__builtin_amdgcn_sdot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_udot8 :
  ClangBuiltin<"__builtin_amdgcn_udot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0. . . 7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
// b[i in 0. . . 7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sudot8 :
  ClangBuiltin<"__builtin_amdgcn_sudot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

//===----------------------------------------------------------------------===//
// gfx908 intrinsics
// ===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
// cbsz/abid/blgp must be immediates.
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  Intrinsic<[DestTy],
            [SrcABTy, SrcABTy, DestTy,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, IntrWillReturn,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_mfma_f32_32x32x1f32  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x2f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x4f32  : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x4f16  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x4f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_4x4x4f16    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_32x32x8f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_i32_32x32x4i8   : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x4i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_4x4x4i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_i32_32x32x8i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x16i8  : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_4x4x2bf16   : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;

//===----------------------------------------------------------------------===//
// gfx90a intrinsics
// ===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

def int_amdgcn_mfma_f32_32x32x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;

// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA.
// Three bits corresponding to the neg modifier applied to the respective
// source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty,  llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64   : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
// ===----------------------------------------------------------------------===//

// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : Intrinsic<
    [llvm_v2i16_ty],
    [LLVMQualPointerType<llvm_v2i16_ty, 3>, llvm_v2i16_ty],
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>]>,
    ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

def int_amdgcn_mfma_i32_16x16x32_i8  : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8  : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
// cbsz/abid must be immediates.
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  Intrinsic<[DestTy],
            [SrcA, SrcB, DestTy, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16  : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16  : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8   : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty,  llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8   : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
// ===----------------------------------------------------------------------===//
// Structurized control-flow markers inserted/consumed by the backend.
def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, LLVMMatchType<0>],
  [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
  [IntrConvergent, IntrWillReturn]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;

// Emit 2.5 ulp, no denormal division. Should only be inserted by
// pass based on !fpmath metadata.
2352def int_amdgcn_fdiv_fast : Intrinsic< 2353 [llvm_float_ty], [llvm_float_ty, llvm_float_ty], 2354 [IntrNoMem, IntrSpeculatable, IntrWillReturn] 2355>; 2356 2357// Represent a relocation constant. 2358def int_amdgcn_reloc_constant : Intrinsic< 2359 [llvm_i32_ty], [llvm_metadata_ty], 2360 [IntrNoMem, IntrSpeculatable, IntrWillReturn] 2361>; 2362} 2363