//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the AMDGPU-specific intrinsics.
//
//===----------------------------------------------------------------------===//

class AMDGPUReadPreloadRegisterIntrinsic
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>, ClangBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
  int RsrcArg = rsrcarg;
  bit IsImage = isimage;
}

let TargetPrefix = "r600" in {

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}

defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;

def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;

// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
def int_r600_implicitarg_ptr :
  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrWillReturn]>,
  ClangBuiltin<"__builtin_r600_rat_store_typed">;

def int_r600_recipsqrt_ieee : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_recipsqrt_clamped : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_cube : Intrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_store_stream_output : Intrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn]
>;

class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem, IntrWillReturn]
>;

class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [
  llvm_v4i32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem, IntrWillReturn]
>;

def int_r600_store_swizzle :
  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn]
>;

def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;

def int_r600_dot4 : Intrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_r600_kill : Intrinsic<[], [llvm_float_ty], [IntrWillReturn]>;

} // End TargetPrefix = "r600"

let TargetPrefix = "amdgcn" in {

//===----------------------------------------------------------------------===//
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;

def int_amdgcn_dispatch_ptr :
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_queue_ptr :
  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_kernarg_segment_ptr :
  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_implicitarg_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_groupstaticsize :
  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_dispatch_id :
  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// For internal use. Coordinates LDS lowering between IR transform and backend.
def int_amdgcn_lds_kernel_id :
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_implicit_buffer_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
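
// Illustrative IR uses of the preload-register intrinsics above (the values
// shown are placeholders):
//   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
//   %wg.y  = call i32 @llvm.amdgcn.workgroup.id.y()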

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
// FIXME: Should be mangled for wave size.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, ImmArg<ArgIndex<0>>]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic, computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wavefrontsize :
  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
  Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;


//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is s_sendmsg immediate (i16),
// the second one is copied to m0
def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;


// gfx11 intrinsic
// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64.
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>;

// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0008: MFMA instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
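// For example (illustrative IR), a mask of 0x6 = VALU | SALU allows only VALU
// and SALU instructions to be reordered across the barrier:
//   call void @llvm.amdgcn.sched.barrier(i32 6)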
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn]>;

def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_div_scale : Intrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Select quotient. Must equal Numerator or Denominator.
  //                (0 = Denominator, 1 = Numerator).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>, IntrWillReturn]
>;

def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_div_fixup : Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Look Up 2.0 / pi src0 with segment select src1[4:0]
def int_amdgcn_trig_preop : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sin : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cos : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_log_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
>;

// Fused single-precision multiply-add with legacy behaviour for the multiply,
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely different kind of legacy behaviour.)
def int_amdgcn_fma_legacy :
  Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn, Commutative]
>;

def int_amdgcn_rcp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
  Intrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sqrt : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rsq : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  Intrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def int_amdgcn_rsq_clamp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

def int_amdgcn_ldexp : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_frexp_mant : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_frexp_exp : Intrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : Intrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  Intrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pknorm_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pknorm_u16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
  Intrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_class : Intrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_fmed3 : ClangBuiltin<"__builtin_amdgcn_fmed3">,
  Intrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
  Intrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
  Intrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
  Intrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
  Intrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
  Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
  Intrinsic<[llvm_anyfloat_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Fields should mirror atomicrmw
class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
  [llvm_anyptr_ty,
   LLVMMatchType<0>,
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty], // isVolatile
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "",
  [SDNPMemOperand]
>;

def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;

class AMDGPULDSIntrin :
  Intrinsic<[llvm_any_ty],
    [LLVMQualPointerType<LLVMMatchType<0>, 3>,
     LLVMMatchType<0>,
     llvm_i32_ty, // ordering
     llvm_i32_ty, // scope
     llvm_i1_ty], // isVolatile
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]
>;

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<llvm_i32_ty, 2>, // IntToPtr(M0)
   llvm_i32_ty, // value to add or swap
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty,  // isVolatile
   llvm_i32_ty, // ordered count index (OA index), also added to the address
                // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,  // wave release, usually set to 1
   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
  [IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>
  ]
>;

class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty],    // isVolatile
  [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
   NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>],
  "",
  [SDNPMemOperand]
>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

} // TargetPrefix = "amdgcn"

// New-style image intrinsics

//////////////////////////////////////////////////////////////////////////
// Dimension-aware image intrinsics framework
//////////////////////////////////////////////////////////////////////////

// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}

// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}

// Return arglist, with LLVMMatchType's references shifted by 'shift'.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}

// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
               !add(shift, !foldl(0, lhs, a, b,
                                  !add(a, b.Type.isAny)))>.ret));
}

// Represent texture/image types / dimensionality.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names,
                     bit msaa = 0> {
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name;           // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0;                   // DA bit in MIMG encoding
  bit MSAA = msaa;

  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}

def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
}

def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}
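
// For example, AMDGPUDim2DArray above yields coordinate/slice arguments
// (s, t, slice) and gradient arguments (dsdh, dtdh, dsdv, dtdv); the MSAA
// dimensions additionally carry a "fragid" slice argument.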

// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod;
  string LowerCaseMod = lcmod;

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}

// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}
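
// For example, the helpers above expand to variants such as
// AMDGPUSample_c_cl_o (IMAGE_SAMPLE_C_CL_O), which the sample multiclasses
// further below turn into intrinsics like llvm.amdgcn.image.sample.c.cl.o.2d.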

// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}

class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}

class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}

class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}

class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}

class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}

// Helper class for figuring out image intrinsic argument indexes.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !not(P_.IsAtomic);
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
  int MipArgIndex = LodArgIndex;
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}

// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : Intrinsic<
    P_.RetTypes,                                // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type),     // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),      // dmask(imm)
      P_.AddrTypes,                             // vaddr(VGPR)
      [llvm_v8i32_ty],                          // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,          // samp(SGPR)
                        llvm_i1_ty], []),       // unorm(imm)
      [llvm_i32_ty,                             // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                           // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)

    !listconcat(props,
      !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
      !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
      [IntrWillReturn],
      [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
       ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),


    "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}
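
// For reference, the class above yields intrinsics whose IR calls look like
// the following (illustrative, for a 2D dimension with f32 coordinates and a
// v4f32 result; all %-values are placeholders):
//   %tex = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(
//              i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
//   %smp = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(
//              i32 15, float %u, float %v, <8 x i32> %rsrc, <4 x i32> %samp,
//              i1 false, i32 0, i32 0)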

// Marker class for intrinsics with a DMask that determines the returned
// channels.
class AMDGPUImageDMaskIntrinsic;

defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

  //////////////////////////////////////////////////////////////////////////
  // Load and store intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = false> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;

  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;

  //////////////////////////////////////////////////////////////////////////
  // MSAA intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  //////////////////////////////////////////////////////////////////////////
  // sample and getlod intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }

  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }

  //////////////////////////////////////////////////////////////////////////
  // gather4 intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
}

//////////////////////////////////////////////////////////////////////////
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
          !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
               AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
          [], [SDNPMemOperand]>;
    }
  }

  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;

  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
}
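
// Illustrative IR for the image atomics above (dmask is omitted for atomics;
// the data and coordinate types are overloaded; %-values are placeholders):
//   %old = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(
//              i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)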

//////////////////////////////////////////////////////////////////////////
// Buffer intrinsics
//////////////////////////////////////////////////////////////////////////

let TargetPrefix = "amdgcn" in {

defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, IntrWillReturn,
   ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;

// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : Intrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;

class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, IntrWillReturn,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;

// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different from the
// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
// and swizzling changes depending on whether idxen is set in the instruction.
// These new intrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.
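// Illustrative IR calls (overloaded return type shown as <4 x float>;
// %-values are placeholders):
//   %raw = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(
//              <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
//   %str = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(
//              <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)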
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem, IntrWillReturn, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;

class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem, IntrWillReturn, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;

class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;

class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;

class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
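
// Illustrative IR call for the raw buffer atomics defined below
// (%-values are placeholders):
//   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(
//              i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)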
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;

class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<6>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;


// Obsolescent tbuffer intrinsics.
def int_amdgcn_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // voffset(VGPR)
     llvm_i32_ty,     // soffset(SGPR)
     llvm_i32_ty,     // offset(imm)
     llvm_i32_ty,     // dfmt(imm)
     llvm_i32_ty,     // nfmt(imm)
     llvm_i1_ty,      // glc(imm)
     llvm_i1_ty],     // slc(imm)
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
     ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,     // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // voffset(VGPR)
     llvm_i32_ty,     // soffset(SGPR)
     llvm_i32_ty,     // offset(imm)
     llvm_i32_ty,     // dfmt(imm)
     llvm_i32_ty,     // nfmt(imm)
     llvm_i1_ty,      // glc(imm)
     llvm_i1_ty],     // slc(imm)
    [IntrWriteMem, IntrWillReturn, ImmArg<ArgIndex<5>>,
     ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
     ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_raw_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,     // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
    [IntrWriteMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_load : Intrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
    [IntrReadMem, IntrWillReturn,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_tbuffer_store : Intrinsic <
    [],
    [llvm_any_ty,     // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
    [IntrWriteMem, IntrWillReturn,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

class AMDGPUBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  [llvm_i32_ty],
  [llvm_i32_ty,       // src(VGPR)
   llvm_i32_ty,       // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;

class AMDGPUBufferAtomicFP : Intrinsic <
  [llvm_anyfloat_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;

class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                       //                                   bit 1 = slc,
                                       //                                   bit 2 = dlc on gfx10+))
                                       //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;

class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,                      // rsrc(SGPR)
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base offset
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // vindex(VGPR)
   llvm_i32_ty,                        // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,                        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,                        // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                                       //                                   bit 1 = slc,
                                       //                                   bit 2 = dlc on gfx10+))
                                       //                 swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;

} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
// call site.
def int_amdgcn_exp : Intrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i1_ty         // vm (ignored on GFX11+)
  ],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly,
   IntrWillReturn]
>;

// exp with row_en bit set. Only supported on GFX11+.
def int_amdgcn_exp_row : Intrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i32_ty],      // row number
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   IntrWriteMem, IntrInaccessibleMemOnly, IntrWillReturn]
>;
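
// Illustrative IR for a final color export to MRT0 with all four channels
// enabled and the done bit set (%-values are placeholders):
//   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %g,
//                                  float %b, float %a, i1 true, i1 true)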
// exp with compr bit set. Not supported on GFX11+.
def int_amdgcn_exp_compr : Intrinsic <[], [
  llvm_i32_ty,        // tgt
  llvm_i32_ty,        // en
  llvm_anyvector_ty,  // src0 (v2f16 or v2i16)
  LLVMMatchType<0>,   // src1
  llvm_i1_ty,         // done
  llvm_i1_ty],        // vm
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly,
   IntrWillReturn]
>;

def int_amdgcn_buffer_wbinvl1_sc :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_buffer_wbinvl1 :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_dcache_inv :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_memtime :
  ClangBuiltin<"__builtin_amdgcn_s_memtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_sleep :
  ClangBuiltin<"__builtin_amdgcn_s_sleep">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]> {
}

def int_amdgcn_s_incperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]> {
}

def int_amdgcn_s_decperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_decperflevel">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]> {
}

def int_amdgcn_s_sethalt :
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_setprio :
  ClangBuiltin<"__builtin_amdgcn_s_setprio">,
  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects, IntrWillReturn]>;

// This is IntrHasSideEffects so it can be used to read cycle counters.
def int_amdgcn_s_getreg :
  ClangBuiltin<"__builtin_amdgcn_s_getreg">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg<ArgIndex<0>>]
>;

// Note this can be used to set FP environment properties that are
// unsafe to change in non-strictfp functions. The register properties
// available (and value required to access them) may differ per
// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
def int_amdgcn_s_setreg :
  ClangBuiltin<"__builtin_amdgcn_s_setreg">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrHasSideEffects, IntrWillReturn, ImmArg<ArgIndex<0>>]
>;
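// Example (illustrative IR sketch of the counter/sleep intrinsics above; the
// sleep operand must be an immediate):
//   %t0 = call i64 @llvm.amdgcn.s.memtime()
//   call void @llvm.amdgcn.s.sleep(i32 2)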
// int_amdgcn_s_getpc is provided to allow a specific style of position
// independent code to determine the high part of its address when it is
// known (through convention) that the code and any data of interest does
// not cross a 4 GiB address boundary. Use for any other purpose may not
// produce the desired results as optimizations may cause code movement,
// especially as we explicitly use IntrNoMem to allow optimizations.
def int_amdgcn_s_getpc :
  ClangBuiltin<"__builtin_amdgcn_s_getpc">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
                                IntrWillReturn]>;

// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
// param values: 0 = P10, 1 = P20, 2 = P0
def int_amdgcn_interp_mov :
  ClangBuiltin<"__builtin_amdgcn_interp_mov">,
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
// This intrinsic reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_interp_p1 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
// See int_amdgcn_interp_p1 for why this is IntrNoMem.
def int_amdgcn_interp_p2 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p1_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">,
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p2_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">,
  Intrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.lds.direct.load <m0>
// The input argument is m0, which contains a packed combination of address
// offset and flags describing the data type.
def int_amdgcn_lds_direct_load :
  Intrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16
            [llvm_i32_ty],
            [IntrReadMem, IntrSpeculatable, IntrWillReturn]>;
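// Example (illustrative IR sketch of two-stage parameter interpolation with
// the interp.p1/p2 intrinsics above; attr_chan and attr are immediates):
//   %p1  = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %m0)
//   %val = call float @llvm.amdgcn.interp.p2(float %p1, float %j, i32 0, i32 0, i32 %m0)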
// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0>
// Like interp intrinsics, this reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_lds_param_load :
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0>
def int_amdgcn_interp_inreg_p10 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
def int_amdgcn_interp_inreg_p2 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
// high selects whether high or low 16-bits are used for p and p0 operands
def int_amdgcn_interp_inreg_p10_f16 :
  Intrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high>
// high selects whether high or low 16-bits are used for p operand
def int_amdgcn_interp_inreg_p2_f16 :
  Intrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable, IntrWillReturn,
             ImmArg<ArgIndex<3>>]>;

// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : Intrinsic <
  [llvm_i1_ty],
  [],
  [IntrNoMem, IntrWillReturn]>;

// Query currently live lanes.
// Returns true if lane is live (and not a helper lane).
def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty],
  [], [IntrReadMem, IntrInaccessibleMemOnly, IntrWillReturn]
>;

def int_amdgcn_mbcnt_lo :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn]>;

def int_amdgcn_mbcnt_hi :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn]>;

// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
  ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>]>;

def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_lerp :
  ClangBuiltin<"__builtin_amdgcn_lerp">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_msad_u8 :
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_sad_hi_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;
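// Example (illustrative IR sketch of the usual flat lane-id idiom built from
// the mbcnt intrinsics above):
//   %lo   = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
//   %lane = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)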
def int_amdgcn_sad_u16 :
  ClangBuiltin<"__builtin_amdgcn_sad_u16">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_qsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mqsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
  Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mqsad_u32_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
  Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_cvt_pk_u8_f32 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
  Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_icmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>]>;

def int_amdgcn_fcmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>]>;

def int_amdgcn_ballot :
  Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_readfirstlane :
  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
  ClangBuiltin<"__builtin_amdgcn_readlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
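// Example (illustrative IR sketch; a wave64 ballot paired with a uniform
// broadcast via readfirstlane):
//   %mask  = call i64 @llvm.amdgcn.ballot.i64(i1 %cond)
//   %first = call i32 @llvm.amdgcn.readfirstlane(i32 %value)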
// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
  ClangBuiltin<"__builtin_amdgcn_writelane">,
  Intrinsic<[llvm_i32_ty], [
    llvm_i32_ty,    // uniform value to write: returned by the selected lane
    llvm_i32_ty,    // uniform lane select
    llvm_i32_ty     // returned by all lanes other than the selected one
  ],
  [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mul_i24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mulhi_i24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

def int_amdgcn_mulhi_u24 : Intrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_init :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_init">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrWriteMem,
   IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.barrier(i32 bar_val, i32 resource_id)
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_barrier :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.v(i32 resource_id)
def int_amdgcn_ds_gws_sema_v :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id)
def int_amdgcn_ds_gws_sema_br :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.p(i32 resource_id)
def int_amdgcn_ds_gws_sema_p :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id)
def int_amdgcn_ds_gws_sema_release_all :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn], "",
  [SDNPMemOperand]
>;
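// Example (illustrative IR sketch; a GWS barrier across N waves passes N-1 as
// the first operand, here with resource id 0):
//   call void @llvm.amdgcn.ds.gws.init(i32 %waves_minus_1, i32 0)
//   call void @llvm.amdgcn.ds.gws.barrier(i32 %waves_minus_1, i32 0)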
// Copies the source value to the destination value, with the guarantee that
// the source value is computed as if the entire program were executed in WQM.
def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Copies the source value to the destination value, such that the source
// is computed as if the entire program were executed in WQM if any other
// program code executes in WQM.
def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Return true if at least one thread within the pixel quad passes true into
// the function.
def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
  [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

// If false, set EXEC=0 for the current thread until the end of program.
// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn?
def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;

def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">,
  Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects]
>;

// If false, mark all active lanes as helper lanes until the end of program.
def int_amdgcn_wqm_demote : Intrinsic<[],
  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly]
>;

// Copies the active channels of the source value to the destination value,
// with the guarantee that the source value is computed as if the entire
// program were executed in Whole Wavefront Mode, i.e. with all channels
// enabled, with a few exceptions:
// - Phi nodes which require WWM return an undefined value.
def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn]
>;
// Deprecated. Use int_amdgcn_strict_wwm instead.
def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn]
>;
def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn]
>;

// Given a value, copies it while setting all the inactive lanes to a given
// value. Note that OpenGL helper lanes are considered active, so if the
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>,   // value to be copied
             LLVMMatchType<0>],  // value for the inactive lanes to take
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;
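// Example (illustrative IR sketch of a whole-wave reduction idiom: inactive
// lanes are given an identity value, then the reduced result is copied out of
// WWM; the reduction itself is elided here):
//   %v.all = call i32 @llvm.amdgcn.set.inactive.i32(i32 %v, i32 0)
//   ; ... reduce %v.all across lanes into %sum ...
//   %res   = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %sum)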
// Return true if the given flat pointer points to a local memory address.
def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">,
  Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>, IntrWillReturn]
>;

// Return true if the given flat pointer points to a private memory address.
def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">,
  Intrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>, IntrWillReturn]
>;

//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_s_dcache_inv_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_buffer_wbinvl1_vol :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// VI Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_mov_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
             llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// Should be equivalent to:
// v_mov_b32 <dest> <old>
// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_update_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_s_dcache_wb :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_dcache_wb_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_s_memrealtime :
  ClangBuiltin<"__builtin_amdgcn_s_memrealtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

// llvm.amdgcn.ds.permute <index> <src>
def int_amdgcn_ds_permute :
  ClangBuiltin<"__builtin_amdgcn_ds_permute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// llvm.amdgcn.ds.bpermute <index> <src>
def int_amdgcn_ds_bpermute :
  ClangBuiltin<"__builtin_amdgcn_ds_bpermute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn]>;

// llvm.amdgcn.perm <src0> <src1> <selector>
def int_amdgcn_perm :
  ClangBuiltin<"__builtin_amdgcn_perm">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//

class AMDGPUGlobalLoadLDS : Intrinsic <
  [],
  [LLVMQualPointerType<llvm_i8_ty, 1>,  // Base global pointer to load from
   LLVMQualPointerType<llvm_i8_ty, 3>,  // LDS base pointer to store to
   llvm_i32_ty,                         // Data byte size: 1/2/4
   llvm_i32_ty,                         // imm offset (applied to both global and LDS address)
   llvm_i32_ty],                        // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
                                        //                                   bit 1 = slc/sc1,
                                        //                                   bit 2 = dlc on gfx10+,
                                        //                                   bit 4 = scc/nt on gfx90a+))
  [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>],
  "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
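// Example (illustrative IR sketch; copies one dword from global memory into
// LDS, written with opaque pointers and zero offsets/aux bits):
//   call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %g, ptr addrspace(3) %l,
//                                          i32 4, i32 0, i32 0)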
//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
// <sel> is a 32-bit constant whose high 8 bits must be zero; it selects
// the lanes to read from.
def int_amdgcn_mov_dpp8 :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>]>;

def int_amdgcn_s_get_waveid_in_workgroup :
  ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
  Intrinsic<[llvm_i32_ty], [],
    [IntrNoMem, IntrHasSideEffects, IntrWillReturn]>;

class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
  [vt],
  [llvm_anyptr_ty,    // vaddr
   vt],               // vdata(VGPR)
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>], "",
  [SDNPMemOperand]>;

def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;
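// Example (illustrative IR sketch for the permlane intrinsics above; fi and
// bound_ctrl are immediates, the lane-select operands may be runtime values):
//   %x = call i32 @llvm.amdgcn.permlanex16(i32 %old, i32 %src0,
//                                          i32 %sel_lo, i32 %sel_hi, i1 false, i1 false)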
// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
//                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
def int_amdgcn_image_bvh_intersect_ray :
  Intrinsic<[llvm_v4i32_ty],
            [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
             LLVMMatchType<1>, llvm_v4i32_ty],
            [IntrReadMem, IntrWillReturn]>;

//===----------------------------------------------------------------------===//
// GFX11 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn]>;

def int_amdgcn_ds_add_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

def int_amdgcn_ds_sub_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn]>;

// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C.

class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %high
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<3>>]
>;

class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      llvm_i1_ty,       // %A_sign
      AB,               // %A
      llvm_i1_ty,       // %B_sign
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %clamp
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
>;

def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
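// Example (illustrative IR sketch, assuming wave32, where the f32 accumulator
// D/C is packed as <8 x float>; the overload suffix follows the D/C type):
//   %d = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(
//            <16 x half> %a, <16 x half> %b, <8 x float> %c)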
//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//

// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2 :
  ClangBuiltin<"__builtin_amdgcn_fdot2">,
  Intrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
  Intrinsic<
    [llvm_half_ty],  // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_half_ty   // %c
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
  >;

// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_bf16_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
  Intrinsic<
    [llvm_i16_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i16_ty    // %c
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
  >;

// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f32_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
  Intrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_sdot2 :
  ClangBuiltin<"__builtin_amdgcn_sdot2">,
  Intrinsic<
    [llvm_i32_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_udot2 :
  ClangBuiltin<"__builtin_amdgcn_udot2">,
  Intrinsic<
    [llvm_i32_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sdot4 :
  ClangBuiltin<"__builtin_amdgcn_sdot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_udot4 :
  ClangBuiltin<"__builtin_amdgcn_udot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;
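// Example (illustrative IR sketch of the packed dot products above, with
// clamping disabled; %pa/%pb hold four packed i8 lanes each):
//   %acc1 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b,
//                                         float %acc0, i1 false)
//   %sum1 = call i32 @llvm.amdgcn.sdot4(i32 %pa, i32 %pb, i32 %sum0, i1 false)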
// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0...3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
// b[i in 0...3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sudot4 :
  ClangBuiltin<"__builtin_amdgcn_sudot4">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//      %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sdot8 :
  ClangBuiltin<"__builtin_amdgcn_sdot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//      %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_udot8 :
  ClangBuiltin<"__builtin_amdgcn_udot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
// a[i in 0...7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
// b[i in 0...7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//      %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sudot8 :
  ClangBuiltin<"__builtin_amdgcn_sudot8">,
  Intrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;
//===----------------------------------------------------------------------===//
// gfx908 intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  Intrinsic<[DestTy],
            [SrcABTy, SrcABTy, DestTy,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, IntrWillReturn,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x2f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x4f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x4f16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x4f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_4x4x4f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_32x32x8f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_i32_32x32x4i8 : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
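// Example (illustrative IR sketch for the MFMA intrinsics above; the cbsz,
// abid, and blgp modifiers are immediates and are left at 0 here):
//   %d = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(
//            float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)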
//===----------------------------------------------------------------------===//
// gfx90a intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;

// Note: in gfx940 the BLGP argument is replaced by a NEG bitfield in the DGEMM
// MFMA. Its three bits correspond to the neg modifier applied to the
// respective source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
//===----------------------------------------------------------------------===//

// bf16 atomics use v2i16 argument since there is no bf16 data type in LLVM.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : Intrinsic<
    [llvm_v2i16_ty],
    [LLVMQualPointerType<llvm_v2i16_ty, 3>, llvm_v2i16_ty],
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>]>,
    ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
  AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}

defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
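// Example (illustrative IR sketch for the fp8 MFMA variants generated by the
// defm above; A and B are i64 values holding eight packed fp8 elements each,
// modifiers left at 0):
//   %d = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(
//            i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0)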
// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  Intrinsic<[DestTy],
            [SrcA, SrcB, DestTy, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
  AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}

defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects byte from srcA.
def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
  Intrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
  Intrinsic<[llvm_v2f32_ty],
            [llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel
def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
  Intrinsic<[llvm_v2f32_ty],
            [llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
  Intrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
  Intrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects byte to write into vdst.
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
  Intrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
  Intrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrWillReturn, ImmArg<ArgIndex<3>>]>;
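// Example (illustrative IR sketch of unpacking fp8/bf8 data with the
// conversion intrinsics above; byte_sel/word_sel are immediates):
//   %f    = call float @llvm.amdgcn.cvt.f32.fp8(i32 %packed, i32 0)
//   %pair = call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %packed, i1 false)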
//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
//===----------------------------------------------------------------------===//
def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, LLVMMatchType<0>],
  [IntrNoMem, IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
  [IntrConvergent, IntrWillReturn]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;

// Emit 2.5 ulp, no denormal division. Should only be inserted by
// a pass based on !fpmath metadata.
def int_amdgcn_fdiv_fast : Intrinsic<
  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;

// Represent a relocation constant.
def int_amdgcn_reloc_constant : Intrinsic<
  [llvm_i32_ty], [llvm_metadata_ty],
  [IntrNoMem, IntrSpeculatable, IntrWillReturn]
>;
}