//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the AMDGPU-specific intrinsics (both the legacy
// "r600" prefix and the current "amdgcn" prefix).
//
//===----------------------------------------------------------------------===//

// Base class for intrinsics that read a preloaded hardware register (no
// memory access, freely speculatable).
class AMDGPUReadPreloadRegisterIntrinsic
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

// Same, but also exposed as a Clang builtin under the given name.
class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
  int RsrcArg = rsrcarg;   // argument index of the resource descriptor
  bit IsImage = isimage;
}

let TargetPrefix = "r600" in {

// Instantiate _x/_y/_z variants of a preload-register read.
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}

// As above, with matching Clang builtins "<prefix>_x" / "_y" / "_z".
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}

defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;

def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;

// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
def int_r600_implicitarg_ptr :
  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<7>], [],
  [IntrNoMem, IntrSpeculatable]>;

def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>,
  ClangBuiltin<"__builtin_r600_rat_store_typed">;

def int_r600_recipsqrt_ieee : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_cube : DefaultAttrsIntrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_store_stream_output : DefaultAttrsIntrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
>;

// Texture fetch with float coordinates; shared signature for the tex/txl/txb/
// ddx/ddy families below.
class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;

// Texture fetch with integer coordinates (txf/txq).
class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [
  llvm_v4i32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;

def int_r600_store_swizzle :
  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;

def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>;

} // End TargetPrefix = "r600"

let TargetPrefix = "amdgcn" in {

//===----------------------------------------------------------------------===//
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;

def int_amdgcn_dispatch_ptr :
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_queue_ptr :
  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_kernarg_segment_ptr :
  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_implicitarg_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_groupstaticsize :
  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_dispatch_id :
  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;

// For internal use. Coordinates LDS lowering between IR transform and backend.
def int_amdgcn_lds_kernel_id :
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_implicit_buffer_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
// FIXME: Should be mangled for wave size.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic, computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wavefrontsize :
  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;


//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is s_sendmsg immediate (i16),
// the second one is copied to m0
def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;


// gfx11 intrinsic
// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64.
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The first parameter is a mask that determines the types of instructions that
// you would like to synchronize around and add to a scheduling group. The
// values of the mask are defined above for sched_barrier. These instructions
// will be selected from the bottom up starting from the sched_group_barrier's
// location during instruction scheduling. The second parameter is the number of
// matching instructions that will be associated with this sched_group_barrier.
// The third parameter is an identifier which is used to describe what other
// sched_group_barriers should be synchronized with.
def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
   IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Scheduler optimization hint.
//     MASK = 0: Small gemm opt
def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Select quotient. Must equal Numerator or Denominator.
  //                (0 = Denominator, 1 = Numerator).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
>;

def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// Look Up 2.0 / pi src0 with segment select src1[4:0]
def int_amdgcn_trig_preop : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sin : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cos : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// v_log_{f16|f32}, performs log2. f32 version does not handle
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic log2 intrinsic should be
// preferred.
def int_amdgcn_log : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// v_exp_{f16|f32} (int_amdgcn_exp was taken by export
// already). Performs exp2. f32 version does not handle
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic exp2 intrinsic should be
// preferred.
def int_amdgcn_exp2 : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;

// Fused single-precision multiply-add with legacy behaviour for the multiply,
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely different kind of legacy behaviour.)
def int_amdgcn_fma_legacy :
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;

def int_amdgcn_rcp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sqrt : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>;

// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;

// For int_amdgcn_ldexp_f16, only the low 16 bits of the i32 src1 operand will be used.
def int_amdgcn_ldexp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
>;

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_u16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
  DefaultAttrsIntrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_class : DefaultAttrsIntrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_fmed3 :
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
  DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable]
>;

// Shared profile for the LDS float atomic intrinsics (ds_fadd/fmin/fmax below).
class AMDGPULDSIntrin :
  Intrinsic<[llvm_any_ty],
    [LLVMQualPointerType<3>,
     LLVMMatchType<0>,
     llvm_i32_ty, // ordering
     llvm_i32_ty, // scope
     llvm_i1_ty], // isVolatile
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]
>;

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<2>, // IntToPtr(M0)
   llvm_i32_ty,            // value to add or swap
   llvm_i32_ty,            // ordering
   llvm_i32_ty,            // scope
   llvm_i1_ty,             // isVolatile
   llvm_i32_ty,            // ordered count index (OA index), also added to the address
                           // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,             // wave release, usually set to 1
   llvm_i1_ty],            // wave done, set to 1 for the last ordered instruction
  [IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree
  ]
>;

class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty],    // isVolatile
  [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
   NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
  "",
  [SDNPMemOperand]
>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

} // TargetPrefix = "amdgcn"

// New-style image intrinsics

//////////////////////////////////////////////////////////////////////////
// Dimension-aware image intrinsics framework
//////////////////////////////////////////////////////////////////////////

// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}

// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}

// Return arglist, with LLVMMatchType's references shifted by 'shift'.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}

// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
               !add(shift, !foldl(0, lhs, a, b,
                                  !add(a, b.Type.isAny)))>.ret));
}

// Represent texture/image types / dimensionality.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names,
                     bit msaa = 0> {
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name; // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0; // DA bit in MIMG encoding
  bit MSAA = msaa;

  // Coordinate + slice arguments, as floats (sample) or ints (load/store).
  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  // Per-coordinate horizontal/vertical derivatives ("d<c>dh" / "d<c>dv").
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}

def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
}

// Groupings of the dimensions above, used when instantiating intrinsics.
def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}

// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod;
  string LowerCaseMod = lcmod;

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}

// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  // Emits the base variant plus its _o (offset) form.
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o",
        !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  // Emits {base, _o} plus the _c (z-compare) forms of both.
  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod,
        !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  // Emits the compare set plus its _cl (clamp) forms.
  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}

// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  // Number of overloaded ("any") types among return and data arguments; used
  // to shift LLVMMatchType references in the address list below.
  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  // Full vaddr argument list: extra addr args, gradients (if any), then
  // coordinates/slices followed by the optional lod/clamp/mip argument.
  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  // AddrArgs with overloaded/matched types pinned to the full-size types.
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  // AddrArgs with overloaded/matched types pinned to the 16-bit (A16) types.
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}

// Field-for-field copy of another profile.
class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}

// Profile for IMAGE_SAMPLE-style intrinsics; address layout comes from the
// sample variant.
class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}

// Profile for load/store-style intrinsics (optionally with a mip argument).
class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}

// Profile for integer image atomics.
class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

// Profile for floating-point image atomics.
class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}

// Profile for IMAGE_GET_RESINFO; only takes a mip argument.
class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}

// Helper class for figuring out image intrinsic argument indexes.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  // Argument-group sizes derived from the profile.
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !not(P_.IsAtomic);   // atomics carry no dmask
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);  // sampler descriptor + unorm flag
  // Cumulative argument indexes, in the order the groups are laid out:
  // data, dmask, vaddr (offset/bias/zcompare, gradients, coords, lod/mip),
  // rsrc, samp/unorm, texfailctrl, cachepolicy.
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1); // last vaddr arg
  int MipArgIndex = LodArgIndex;
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}

// All dimension-aware intrinsics are derived from this class.
// Builds one image intrinsic from profile P_. The argument list is:
// per-channel data args (store/atomic forms), dmask (non-atomic only),
// vaddr operands, rsrc descriptor, then for sample variants samp + unorm,
// and finally texfailctrl + cachepolicy. The ImmArg positions are computed
// with AMDGPUImageDimIntrinsicEval so they stay in sync with this layout.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : DefaultAttrsIntrinsic<
    P_.RetTypes,                          // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type), // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),  // dmask(imm)
      P_.AddrTypes,                         // vaddr(VGPR)
      [llvm_v8i32_ty],                      // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,      // samp(SGPR)
                        llvm_i1_ty], []),   // unorm(imm)
      [llvm_i32_ty,                         // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                       // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc)

     !listconcat(props,
          !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
          !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
          [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),


      "", sdnodeprops>,
  // The rsrc operand follows the data and address operands; non-atomic forms
  // have one extra leading operand (the dmask).
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  // Self-reference by record name; presumably used to look the intrinsic
  // back up from generated tables -- TODO confirm against the backend users.
  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}

// Marker class for intrinsics with a DMask that determines the returned
// channels.
class AMDGPUImageDMaskIntrinsic;

// Collects every non-atomic dimension-aware image intrinsic defined below.
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

  //////////////////////////////////////////////////////////////////////////
  // Load and store intrinsics
  //////////////////////////////////////////////////////////////////////////

  // One intrinsic per non-MSAA dimension (1d, 2d, 3d, cube, arrays, ...).
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  // One intrinsic per dimension, including the MSAA ones.
  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = false> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;

  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>,
              AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>,
              AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // MSAA intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  //////////////////////////////////////////////////////////////////////////
  // sample and getlod intrinsics
  //////////////////////////////////////////////////////////////////////////

  // One sampling intrinsic per non-MSAA dimension. NoMem marks variants
  // (getlod) that only query state and never read texture memory.
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  // Expand every sample variant (bias, lod, gradients, compare, ...).
  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }

  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }

  //////////////////////////////////////////////////////////////////////////
  // gather4 intrinsics
  //////////////////////////////////////////////////////////////////////////
  // gather4 is only defined for the dimensions the hardware supports:
  // 2d, cube, and 2darray; gradients are not available.
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
}

//////////////////////////////////////////////////////////////////////////
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  // Core expansion: one atomic intrinsic per dimension, with either an
  // integer or a float profile depending on isFloat.
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
          !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
               AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
          [], [SDNPMemOperand]>;
    }
  }

  // Common case: a single "vdata" operand of the overloaded type.
  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;

  // cmpswap takes two data operands: the new value and the compare value.
  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
}

//////////////////////////////////////////////////////////////////////////
// Buffer intrinsics
//////////////////////////////////////////////////////////////////////////

// Data type for buffer resources (V#). Maybe, in the future, we can create a
// similar one for textures (T#).
def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>;

let TargetPrefix = "amdgcn" in {

// Packs a base pointer, stride, extent and flags into a buffer resource
// descriptor (ptr addrspace(8)).
def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
  [AMDGPUBufferRsrcTy],
  [llvm_anyptr_ty,  // base
   llvm_i16_ty,     // stride (and swizzle control)
   llvm_i32_ty,     // NumRecords / extent
   llvm_i32_ty],    // flags
  // Attributes lifted from ptrmask + some extra argument attributes.
  [IntrNoMem, NoCapture<ArgIndex<0>>, ReadNone<ArgIndex<0>>,
   IntrSpeculatable, IntrWillReturn]>;

defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

// Legacy buffer load with combined glc/slc bit operands.
class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;

// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;

// Legacy buffer store; mirrors AMDGPUBufferLoad with a leading vdata operand.
class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;

// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different to the
// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
// and swizzling changes depending on whether idxen is set in the instruction.
// These new intrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.

// The versions of these intrinsics that take <4 x i32> arguments are deprecated
// in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments,
// which allow for improved reasoning about memory accesses.
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;

// Same as AMDGPURawBufferLoad, but with a ptr addrspace(8) resource; the
// pointer-argument attributes allow alias analysis to reason about it.
class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                       //                                   bit 1 = slc,
                       //                                   bit 2 = dlc on gfx10+),
                       //                 swizzled buffer (bit 3 = swz))
  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;

// Struct variant: adds a vindex operand ahead of the offsets.
class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;

class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex(VGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                       //                                   bit 1 = slc,
                       //                                   bit 2 = dlc on gfx10+),
                       //                 swizzled buffer (bit 3 = swz))
  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad;
def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad;

// Store counterparts of the raw/struct loads above.
class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;

class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,            // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                       //                                   bit 1 = slc,
                       //                                   bit 2 = dlc on gfx10+),
                       //                 swizzled buffer (bit 3 = swz))
  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_ptr_buffer_store_format : AMDGPURawPtrBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_store : AMDGPURawPtrBufferStore;

class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10+),
                      //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;

class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,            // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex(VGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                       //                                   bit 1 = slc,
                       //                                   bit 2 = dlc on gfx10+),
                       //                 swizzled buffer (bit 3 = swz))
  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_ptr_buffer_store_format : AMDGPUStructPtrBufferStore;
def int_amdgcn_struct_ptr_buffer_store : AMDGPUStructPtrBufferStore;

// Buffer atomics. Note: these use plain Intrinsic rather than
// DefaultAttrsIntrinsic, spelling their attributes explicitly.
class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),                // NoRtn variants return nothing
  [!if(NoRtn, data_ty, LLVMMatchType<0>),   // vdata(VGPR)
   llvm_v4i32_ty,                           // rsrc(SGPR)
   llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],   // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

class AMDGPURawPtrBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   AMDGPUBufferRsrcTy,                     // rsrc(SGPR)
   llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],   // cachepolicy(imm; bit 1 = slc)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

def int_amdgcn_raw_ptr_buffer_atomic_swap : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_add : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_sub : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_smin : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_umin : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_fmin : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_smax : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_umax : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_fmax : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_and : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,   // src(VGPR)
   LLVMMatchType<0>,   // cmp(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc)
  [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;

class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   llvm_v4i32_ty,                          // rsrc(SGPR)
   llvm_i32_ty,    // vindex(VGPR)
   llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],   // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

class AMDGPUStructPtrBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
  !if(NoRtn, [], [data_ty]),
  [!if(NoRtn, data_ty, LLVMMatchType<0>),  // vdata(VGPR)
   AMDGPUBufferRsrcTy,                     // rsrc(SGPR)
   llvm_i32_ty,    // vindex(VGPR)
   llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],   // cachepolicy(imm; bit 1 = slc)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_ptr_buffer_atomic_swap : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_add : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_sub : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_smin : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_umin : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_smax : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_umax : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_and : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,   // src(VGPR)
   LLVMMatchType<0>,   // cmp(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex(VGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc)
  [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
   ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;

// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;

// Obsolescent tbuffer intrinsics.
def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
  [llvm_v4i32_ty,   // rsrc(SGPR)
   llvm_i32_ty,     // vindex(VGPR)
   llvm_i32_ty,     // voffset(VGPR)
   llvm_i32_ty,     // soffset(SGPR)
   llvm_i32_ty,     // offset(imm)
   llvm_i32_ty,     // dfmt(imm)
   llvm_i32_ty,     // nfmt(imm)
   llvm_i1_ty,      // glc(imm)
   llvm_i1_ty],     // slc(imm)
  [IntrReadMem,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic <
  [],
  [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
   llvm_v4i32_ty,  // rsrc(SGPR)
   llvm_i32_ty,    // vindex(VGPR)
   llvm_i32_ty,    // voffset(VGPR)
   llvm_i32_ty,    // soffset(SGPR)
   llvm_i32_ty,    // offset(imm)
   llvm_i32_ty,    // dfmt(imm)
   llvm_i32_ty,    // nfmt(imm)
   llvm_i1_ty,     // glc(imm)
   llvm_i1_ty],    // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
   ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
  [llvm_v4i32_ty,   // rsrc(SGPR)
   llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                    //                                   bit 1 = slc,
                    //                                   bit 2 = dlc on gfx10+),
                    //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem,
   ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],       // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
  [AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,        // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                       //                                   bit 1 = slc,
                       //                                   bit 2 = dlc on gfx10+),
                       //                 swizzled buffer (bit 3 = swz))
  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic <
  [],
  [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
   llvm_v4i32_ty,  // rsrc(SGPR)
   llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
                   //                                   bit 1 = slc,
                   //                                   bit 2 = dlc on gfx10+),
                   //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem,
   ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,         // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     AMDGPUBufferRsrcTy,  // rsrc(SGPR)
     llvm_i32_ty,         // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,         // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,         // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],        // auxiliary data (imm, cachepolicy (bit 0 = glc,
                          //                                   bit 1 = slc,
                          //                                   bit 2 = dlc on gfx10+),
                          //                  swizzled buffer (bit 3 = swz))
    [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],   // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
                     //                                   bit 1 = slc,
                     //                                   bit 2 = dlc on gfx10+),
                     //                  swizzled buffer (bit 3 = swz))
    [IntrReadMem,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],        // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [AMDGPUBufferRsrcTy,  // rsrc(SGPR)
     llvm_i32_ty,         // vindex(VGPR)
     llvm_i32_ty,         // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,         // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,         // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],        // auxiliary data (imm, cachepolicy (bit 0 = glc,
                          //                                   bit 1 = slc,
                          //                                   bit 2 = dlc on gfx10+),
                          //                  swizzled buffer (bit 3 = swz))
    [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,         // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     AMDGPUBufferRsrcTy,  // rsrc(SGPR)
     llvm_i32_ty,         // vindex(VGPR)
     llvm_i32_ty,         // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,         // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,         // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],        // auxiliary data (imm, cachepolicy (bit 0 = glc,
                          //                                   bit 1 = slc,
                          //                                   bit 2 = dlc on gfx10+),
                          //                  swizzled buffer (bit 3 = swz))
    [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
                     //                                   bit 1 = slc,
                     //                                   bit 2 = dlc on gfx10+),
                     //                  swizzled buffer (bit 3 = swz))
    [IntrWriteMem,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

class AMDGPUBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  [llvm_i32_ty],
  [llvm_i32_ty,    // src(VGPR)
   llvm_i32_ty,    // cmp(VGPR)
   llvm_v4i32_ty,  // rsrc(SGPR)
   llvm_i32_ty,    // vindex(VGPR)
   llvm_i32_ty,    // offset(SGPR/VGPR/imm)
   llvm_i1_ty],    // slc(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;

class AMDGPUBufferAtomicFP : Intrinsic <
  [llvm_anyfloat_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;

class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,           // rsrc(SGPR)
   LLVMQualPointerType<3>,  // LDS base offset
   llvm_i32_ty,             // Data byte size: 1/2/4
   llvm_i32_ty,             // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,             // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,             // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],            // auxiliary data (imm, cachepolicy (bit 0 = glc,
                            //                                   bit 1 = slc,
                            //                                   bit 2 = dlc on gfx10+))
                            //                  swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;

class AMDGPURawPtrBufferLoadLDS : Intrinsic <
  [],
  [AMDGPUBufferRsrcTy,      // rsrc(SGPR)
   LLVMQualPointerType<3>,  // LDS base offset
   llvm_i32_ty,             // Data byte size: 1/2/4
   llvm_i32_ty,             // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,             // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,             // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],            // auxiliary data (imm, cachepolicy (bit 0 = glc,
                            //                                   bit 1 = slc,
                            //                                   bit 2 = dlc on gfx10+))
                            //                  swizzled buffer (bit 3 = swz))
  [IntrWillReturn, IntrArgMemOnly,
   ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS;

class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,           // rsrc(SGPR)
   LLVMQualPointerType<3>,  // LDS base offset
   llvm_i32_ty,             // Data byte size: 1/2/4
   llvm_i32_ty,             // vindex(VGPR)
   llvm_i32_ty,             // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,             // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,             // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],            // auxiliary data (imm, cachepolicy (bit 0 = glc,
                            //                                   bit 1 = slc,
                            //                                   bit 2 = dlc on gfx10+))
                            //                  swizzled buffer (bit 3 = swz))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;

class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
  [],
  [AMDGPUBufferRsrcTy,      // rsrc(SGPR)
   LLVMQualPointerType<3>,  // LDS base offset
   llvm_i32_ty,             // Data byte size: 1/2/4
   llvm_i32_ty,             // vindex(VGPR)
   llvm_i32_ty,             // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,             // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,             // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],            // auxiliary data (imm, cachepolicy (bit 0 = glc,
                            //                                   bit 1 = slc,
                            //                                   bit 2 = dlc on gfx10+))
                            //                  swizzled buffer (bit 3 = swz))
  [IntrWillReturn, IntrArgMemOnly,
   ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS;

} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
// call site.
def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i1_ty         // vm (ignored on GFX11+)
  ],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly]
>;

// exp with row_en bit set. Only supported on GFX11+.
def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i32_ty],      // row number
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   IntrWriteMem, IntrInaccessibleMemOnly]
>;

// exp with compr bit set. Not supported on GFX11+.
def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,        // tgt,
  llvm_i32_ty,        // en
  llvm_anyvector_ty,  // src0 (v2f16 or v2i16)
  LLVMMatchType<0>,   // src1
  llvm_i1_ty,         // done
  llvm_i1_ty],        // vm
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly]
>;

def int_amdgcn_buffer_wbinvl1_sc :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_buffer_wbinvl1 :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_dcache_inv :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_memtime :
  ClangBuiltin<"__builtin_amdgcn_s_memtime">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_sleep :
  ClangBuiltin<"__builtin_amdgcn_s_sleep">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]>;

def int_amdgcn_s_incperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]>;

def int_amdgcn_s_decperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_decperflevel">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]>;

def int_amdgcn_s_sethalt :
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]>;

def int_amdgcn_s_setprio :
  ClangBuiltin<"__builtin_amdgcn_s_setprio">,
  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                            IntrHasSideEffects]>;

// This is IntrHasSideEffects so it can be used to read cycle counters.
def int_amdgcn_s_getreg :
  ClangBuiltin<"__builtin_amdgcn_s_getreg">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
>;

// Note this can be used to set FP environment properties that are
// unsafe to change in non-strictfp functions. The register properties
// available (and value required to access them) may differ per
// subtarget.
// llvm.amdgcn.s.setreg(hwmode, value)
def int_amdgcn_s_setreg :
  ClangBuiltin<"__builtin_amdgcn_s_setreg">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
>;

// int_amdgcn_s_getpc is provided to allow a specific style of position
// independent code to determine the high part of its address when it is
// known (through convention) that the code and any data of interest does
// not cross a 4Gb address boundary. Use for any other purpose may not
// produce the desired results as optimizations may cause code movement,
// especially as we explicitly use IntrNoMem to allow optimizations.
def int_amdgcn_s_getpc :
  ClangBuiltin<"__builtin_amdgcn_s_getpc">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
                                            IntrWillReturn]>;

// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
// param values: 0 = P10, 1 = P20, 2 = P0
def int_amdgcn_interp_mov :
  ClangBuiltin<"__builtin_amdgcn_interp_mov">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
// This intrinsic reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_interp_p1 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
def int_amdgcn_interp_p2 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
  // See int_amdgcn_v_interp_p1 for why this is IntrNoMem.

// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p1_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p2_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">,
  DefaultAttrsIntrinsic<[llvm_half_ty],
    [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.lds.direct.load <m0>
// The input argument is m0, which contains a packed combination of address
// offset and flags describing the data type.
def int_amdgcn_lds_direct_load :
  DefaultAttrsIntrinsic<[llvm_any_ty],  // overloaded for types u8, u16, i32/f32, i8, i16
    [llvm_i32_ty],
    [IntrReadMem, IntrSpeculatable]>;

// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0>
// Like interp intrinsics, this reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_lds_param_load :
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0>
def int_amdgcn_interp_inreg_p10 :
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
def int_amdgcn_interp_inreg_p2 :
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
// high selects whether high or low 16-bits are used for p and p0 operands
def int_amdgcn_interp_inreg_p10_f16:
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high>
// high selects whether high or low 16-bits are used for p operand
def int_amdgcn_interp_inreg_p2_f16 :
  DefaultAttrsIntrinsic<[llvm_half_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<3>>]>;

// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : DefaultAttrsIntrinsic <
  [llvm_i1_ty],
  [],
  [IntrNoMem]>;

// Query currently live lanes.
// Returns true if lane is live (and not a helper lane).
def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty],
  [], [IntrReadMem, IntrInaccessibleMemOnly]
>;

def int_amdgcn_mbcnt_lo :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem]>;

def int_amdgcn_mbcnt_hi :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem]>;

// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
  ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
     ImmArg<ArgIndex<1>>]>;

def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
  [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
  [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_lerp :
  ClangBuiltin<"__builtin_amdgcn_lerp">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_msad_u8 :
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_hi_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_u16 :
  ClangBuiltin<"__builtin_amdgcn_sad_u16">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_qsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mqsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mqsad_u32_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
  DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_u8_f32 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_icmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent,
             ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_fcmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent,
             ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ballot :
  Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_inverse_ballot :
  Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
    [data_ty],
    [
      LLVMMatchType<0>,  // llvm value to reduce (SGPR/VGPR)
      llvm_i32_ty        // Reduction Strategy Switch for lowering ( 0: Default,
                         //                                          1: Iterative strategy, and
                         //                                          2. DPP)
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

def int_amdgcn_readfirstlane :
  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
  ClangBuiltin<"__builtin_amdgcn_readlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
  ClangBuiltin<"__builtin_amdgcn_writelane">,
  Intrinsic<[llvm_i32_ty], [
    llvm_i32_ty,  // uniform value to write: returned by the selected lane
    llvm_i32_ty,  // uniform lane select
    llvm_i32_ty   // returned by all lanes other than the selected one
  ],
  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_init :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_init">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrWriteMem,
   IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id)
// bar_val is the total number of waves that will wait on this
// barrier, minus 1.
def int_amdgcn_ds_gws_barrier :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.v(i32 resource_id)
def int_amdgcn_ds_gws_sema_v :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id)
def int_amdgcn_ds_gws_sema_br :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">,
  Intrinsic<[],
  [llvm_i32_ty, llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.p(i32 resource_id)
def int_amdgcn_ds_gws_sema_p :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;

// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id)
def int_amdgcn_ds_gws_sema_release_all :
  ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">,
  Intrinsic<[],
  [llvm_i32_ty],
  [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]
>;


// Copies the source value to the destination value, with the guarantee that
// the source value is computed as if the entire program were executed in WQM.
def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Copies the source value to the destination value, such that the source
// is computed as if the entire program were executed in WQM if any other
// program code executes in WQM.
def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Return true if at least one thread within the pixel quad passes true into
// the function.
def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
  [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// If false, set EXEC=0 for the current thread until the end of program.
// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn?
def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>;

def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">,
  Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrNoCallback, IntrNoFree]
>;

// If false, mark all active lanes as helper lanes until the end of program.
def int_amdgcn_wqm_demote : Intrinsic<[],
  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree]
>;

// Copies the active channels of the source value to the destination value,
// with the guarantee that the source value is computed as if the entire
// program were executed in Whole Wavefront Mode, i.e. with all channels
// enabled, with a few exceptions: - Phi nodes which require WWM return an
// undefined value.
def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
// Deprecated.
// Use int_amdgcn_strict_wwm instead.
def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty],
  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
                       IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Given a value, copies it while setting all the inactive lanes to a given
// value. Note that OpenGL helper lanes are considered active, so if the
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
def int_amdgcn_set_inactive :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>,   // value to be copied
             LLVMMatchType<0>],  // value for the inactive lanes to take
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Return if the given flat pointer points to a local memory address.
def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">,
  DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]
>;

// Return if the given flat pointer points to a private memory address.
def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">,
  DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty],
  [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>]
>;

// A uniform tail call to a function with the `amdgpu_cs_chain` or
// `amdgpu_cs_chain_preserve` calling convention. It will populate the SGPRs
// starting at s0 and the VGPRs starting at v8, set EXEC and perform a jump to
// the given function.
// Can only be used in functions with the `amdgpu_cs`, `amdgpu_cs_chain` or
// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control
// flow.
def int_amdgcn_cs_chain:
  Intrinsic<[],
            [llvm_anyptr_ty,  // The function to jump to.
             llvm_anyint_ty,  // Value to put in EXEC (should be i32 or i64).
             llvm_any_ty,     // Arguments that will be copied into SGPRs (s0+).
                              // Must be uniform.
             llvm_any_ty,     // Arguments that will be copied into VGPRs (v8+).
                              // Need not be uniform.
             llvm_i32_ty,     // Flags.
             llvm_vararg_ty   // Additional arguments. Only present if Flags is
                              // non-zero.
            ],
            [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;


//===----------------------------------------------------------------------===//
// CI+ Intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_s_dcache_inv_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_buffer_wbinvl1_vol :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

//===----------------------------------------------------------------------===//
// VI Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_mov_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
             llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
// Should be equivalent to:
// v_mov_b32 <dest> <old>
// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_update_dpp :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_dcache_wb :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_dcache_wb_vol :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_memrealtime :
  ClangBuiltin<"__builtin_amdgcn_s_memrealtime">,
  Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.ds.permute <index> <src>
def int_amdgcn_ds_permute :
  ClangBuiltin<"__builtin_amdgcn_ds_permute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.ds.bpermute <index> <src>
def int_amdgcn_ds_bpermute :
  ClangBuiltin<"__builtin_amdgcn_ds_bpermute">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.perm <src0> <src1> <selector>
def int_amdgcn_perm :
  ClangBuiltin<"__builtin_amdgcn_perm">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
     [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

//===----------------------------------------------------------------------===//
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//

class AMDGPUGlobalLoadLDS : Intrinsic <
  [],
  [LLVMQualPointerType<1>,  // Base global pointer to load from
   LLVMQualPointerType<3>,  // LDS base pointer to store to
   llvm_i32_ty,             // Data byte size: 1/2/4
   llvm_i32_ty,             // imm offset (applied to both global and LDS address)
   llvm_i32_ty],            // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
                            //                                   bit 1 = slc/sc1,
                            //                                   bit 2 = dlc on gfx10+))
                            //                                   bit 4 = scc/nt on gfx90a+))
  [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
  "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;

//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
  Intrinsic<[llvm_i32_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;

// llvm.amdgcn.mov.dpp8.i32 <src> <sel>
// <sel> is a 32-bit constant whose high 8 bits must be zero which selects
// the lanes to read from.
def int_amdgcn_mov_dpp8 :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn,
             ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_get_waveid_in_workgroup :
  ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
  Intrinsic<[llvm_i32_ty], [],
    [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Global atomic that also returns a result; vt is both the data operand type
// and the result type. Only accesses memory through the pointer argument
// (IntrArgMemOnly).
class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
  [vt],
  [llvm_anyptr_ty,    // vaddr
   vt],               // vdata(VGPR)
  [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
  [SDNPMemOperand]>;

def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;

// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
//                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32.
def int_amdgcn_image_bvh_intersect_ray :
  DefaultAttrsIntrinsic<[llvm_v4i32_ty],
            [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty,
             LLVMMatchType<1>, llvm_v4i32_ty],
            [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

//===----------------------------------------------------------------------===//
// GFX11 Intrinsics
//===----------------------------------------------------------------------===//

// llvm.amdgcn.permlane64 <src0>
def int_amdgcn_permlane64 :
  ClangBuiltin<"__builtin_amdgcn_permlane64">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Arg 0 is an immediate (ImmArg); arg 1 is the value to add.
// NOTE(review): presumably arg 0 selects the GS register -- confirm against
// the backend lowering.
def int_amdgcn_ds_add_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
            "", [SDNPMemOperand]>;

def int_amdgcn_ds_sub_gs_reg_rtn :
  ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">,
  Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree],
            "", [SDNPMemOperand]>;

def int_amdgcn_ds_bvh_stack_rtn :
  Intrinsic<
    [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr
    [
      llvm_i32_ty,   // %addr
      llvm_i32_ty,   // %data0
      llvm_v4i32_ty, // %data1
      llvm_i32_ty,   // %offset
    ],
    [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
  >;

// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
//
// These operations perform a matrix multiplication and accumulation of
// the form: D = A * B + C .
// A and B share the matrix input type AB; the accumulator C and the result D
// share the intrinsic's overloaded type CD (LLVMMatchType<0>).
class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// As AMDGPUWmmaIntrinsic, plus an immediate %high (opsel) operand.
class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      AB,               // %A
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %high
    ],
    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

// Integer WMMA variant: immediate per-input sign flags plus an immediate
// clamp flag.
class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
  Intrinsic<
    [CD],               // %D
    [
      llvm_i1_ty,       // %A_sign
      AB,               // %A
      llvm_i1_ty,       // %B_sign
      AB,               // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %clamp
    ],
    [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;

def int_amdgcn_s_wait_event_export_ready :
  ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
  // NOTE(review): unlike the neighboring intrinsics this omits
  // IntrNoCallback/IntrNoFree -- confirm that is intentional.
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
>;

//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//

// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2 :
  ClangBuiltin<"__builtin_amdgcn_fdot2">,
  DefaultAttrsIntrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f16_f16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">,
  DefaultAttrsIntrinsic<
    [llvm_half_ty],  // %r
    [
      llvm_v2f16_ty, // %a
      llvm_v2f16_ty, // %b
      llvm_half_ty   // %c
    ],
    [IntrNoMem, IntrSpeculatable]
  >;

// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
// bf16 operands are modeled as i16 since LLVM has no bf16 type here.
def int_amdgcn_fdot2_bf16_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
  DefaultAttrsIntrinsic<
    [llvm_i16_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i16_ty    // %c
    ],
    [IntrNoMem, IntrSpeculatable]
  >;

// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_fdot2_f32_bf16 :
  ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">,
  DefaultAttrsIntrinsic<
    [llvm_float_ty], // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_float_ty, // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_sdot2 :
  ClangBuiltin<"__builtin_amdgcn_sdot2">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %c
def int_amdgcn_udot2 :
  ClangBuiltin<"__builtin_amdgcn_udot2">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty],   // %r
    [
      llvm_v2i16_ty, // %a
      llvm_v2i16_ty, // %b
      llvm_i32_ty,   // %c
      llvm_i1_ty     // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sdot4 :
  ClangBuiltin<"__builtin_amdgcn_sdot4">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_udot4 :
  ClangBuiltin<"__builtin_amdgcn_udot4">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
//   a[i in 0..3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
//   b[i in 0..3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sudot4 :
  ClangBuiltin<"__builtin_amdgcn_sudot4">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sdot8 :
  ClangBuiltin<"__builtin_amdgcn_sdot8">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_udot8 :
  ClangBuiltin<"__builtin_amdgcn_udot8">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp)
// Treat input as signed (_sign = 1) or unsigned (_sign = 0).
//   a[i in 0..7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
//   b[i in 0..7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sudot8 :
  ClangBuiltin<"__builtin_amdgcn_sudot8">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

//===----------------------------------------------------------------------===//
// gfx908 intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
// cbsz/abid/blgp (args 3-5) must be immediates.
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<[DestTy],
            [SrcABTy, SrcABTy, DestTy,
             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem,
             ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_mfma_f32_32x32x1f32  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x2f32  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x4f32  : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x4f16  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x4f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_4x4x4f16    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_32x32x8f16  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty>;
def int_amdgcn_mfma_i32_32x32x4i8   : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x4i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_4x4x4i8     : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_i32_32x32x8i8   : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x16i8  : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i32_ty>;
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_4x4x2bf16   : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2i16_ty>;

//===----------------------------------------------------------------------===//
// gfx90a intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax   : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>;

def int_amdgcn_mfma_f32_32x32x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k    : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k  : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty>;

// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA.
// Three bits corresponding to the neg modifier applied to the respective
// source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty,  llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64   : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
//===----------------------------------------------------------------------===//

// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16   : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
    [llvm_v2i16_ty],
    [LLVMQualPointerType<3>, llvm_v2i16_ty],
    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
    ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

def int_amdgcn_mfma_i32_16x16x32_i8  : AMDGPUMfmaIntrinsic<llvm_v4i32_ty,  llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8  : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty,  llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

// fp8/bf8 MFMA: the A/B source type is an i64 (packed 8-bit float data).
class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
  AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

// Expands to the four source-format combinations (bf8/fp8 x bf8/fp8).
multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}
defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
// cbsz/abid (args 4-5) must be immediates.
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<[DestTy],
            [SrcA, SrcB, DestTy, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty],
            [IntrConvergent, IntrNoMem,
             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16  : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16  : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty,  llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8   : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty,  llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8   : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

// fp8/bf8 SMFMAC: A is v2i32 and B is v4i32 (packed 8-bit float data).
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
  AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

// Expands to the four source-format combinations (bf8/fp8 x bf8/fp8).
multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}

defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects byte from srcA.
def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
            [llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel.
def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
            [llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
            [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects byte to write into vdst.
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// Represent a relocation constant.
def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic<
  [llvm_i32_ty], [llvm_metadata_ty],
  [IntrNoMem, IntrSpeculatable]
>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
//===----------------------------------------------------------------------===//

def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, LLVMMatchType<0>],
  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
  [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;

// Emit 2.5 ulp, no denormal division. Should only be inserted by
// pass based on !fpmath metadata.
def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;
// NOTE(review): this brace closes a `let` scope opened before this chunk
// (presumably `let TargetPrefix = "amdgcn"`) -- confirm against the full file.
}