; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s

; Codegen test for gfx90a: checks which register class (VGPR "v" or AGPR "a")
; is used for the data operands of global loads/stores, global atomics and DS
; accesses whose values feed, or come from, MFMA instructions, and whether
; v_accvgpr_write / v_accvgpr_read copies are emitted in between.

declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()

; A <32 x float> is loaded, used only as the MFMA accumulator, and the MFMA
; result is stored back. The checks expect AGPR loads and stores with no
; accvgpr copies.
; GCN-LABEL: {{^}}test_load_mfma_store16:
; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
define amdgpu_kernel void @test_load_mfma_store16(<32 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
  ret void
}

; A single float is loaded into element 0 of an otherwise zero accumulator and
; only element 0 of the MFMA result is stored. The checks expect the load to
; write an AGPR and the store to read the first result AGPR directly.
; GCN-LABEL: {{^}}test_load1_mfma_store1:
; GCN: global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_read
; GCN: v_mfma_f32_32x32x1f32 a{{\[}}[[N:[0-9]+]]:
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
define amdgpu_kernel void @test_load1_mfma_store1(float addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
  %in.1 = load float, float addrspace(1)* %gep
  %init = insertelement <32 x float> zeroinitializer, float %in.1, i32 0
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %init, i32 1, i32 2, i32 3)
  %elt = extractelement <32 x float> %mai.1, i32 0
  store float %elt, float addrspace(1)* %gep
  ret void
}

; Same load -> MFMA -> store shape with the <4 x i32> 4x4x4i8 intrinsic. The
; checks expect the store to read exactly the AGPR tuple the MFMA wrote.
; GCN-LABEL: {{^}}test_load4_mfma_store4:
; GCN: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_write
; GCN: v_mfma_i32_4x4x4i8 [[A:a\[[0-9:]+\]]]
; GCN-NEXT: s_nop 4
; GCN-NOT: v_accvgpr_read
; GCN-NEXT: global_store_dwordx4 v{{[0-9:]+}}, [[A]], s[{{[0-9:]+}}]
define amdgpu_kernel void @test_load4_mfma_store4(<4 x i32> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %gep
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
  ret void
}

; Plain load/store copy with no MFMA involved. The checks expect VGPRs only
; and no accvgpr instructions at all.
; GCN-LABEL: {{^}}test_load_store:
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr
; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @test_load_store(<32 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
  store <32 x float> %in.1, <32 x float> addrspace(1)* %gep.2
  ret void
}

; The loaded value goes through an fadd before reaching the MFMA. The checks
; expect VGPR loads, 32 v_accvgpr_write copies into the accumulator, and the
; result stored straight from AGPRs.
; GCN-LABEL: {{^}}test_load_add_mfma_store:
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-COUNT-32: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 2
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_load_add_mfma_store(<32 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  %add.1 = fadd <32 x float> %in.1, %in.1
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
  ret void
}

; Load -> fadd -> store with no MFMA. The checks expect VGPRs throughout and
; no accvgpr instructions.
; GCN-LABEL: {{^}}test_load_add_store:
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr
; GCN-COUNT-16: v_pk_add_f32
; GCN-NOT: v_accvgpr
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_load_add_store(<32 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  %add.1 = fadd <32 x float> %in.1, %in.1
  store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
  ret void
}

; The loaded value is used both as the MFMA accumulator and as an fadd
; operand, and the stored value comes from the fadd. The checks expect VGPR
; loads/stores with 32 writes before and 32 reads after the MFMA.
; GCN-LABEL: {{^}}test_load_mfma_add_store:
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-COUNT-32: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-COUNT-32: v_accvgpr_read
; GCN: v_pk_add_f32
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_load_mfma_add_store(<32 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
  %add.1 = fadd <32 x float> %mai.1, %in.1
  store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
  ret void
}

; VALU arithmetic on both sides of the MFMA (fadd before, fmul after). The
; checks expect VGPR loads/stores with accvgpr writes and reads around the
; MFMA.
; GCN-LABEL: {{^}}test_load_add_mfma_mul_store:
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN: v_pk_add_f32
; GCN-COUNT-32: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-COUNT-32: v_accvgpr_read
; GCN: v_pk_mul_f32
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  %add.1 = fadd <32 x float> %in.1, %in.1
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
  %mul.1 = fmul <32 x float> %mai.1, %mai.1
  store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
  ret void
}

; Like the previous test, except the fmul after the MFMA multiplies the MFMA
; result by the originally loaded value, so the load has a second use past
; the MFMA.
; GCN-LABEL: {{^}}test_mixeduse_load_add_mfma_mul_store:
; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-COUNT-32: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-COUNT-32: v_accvgpr_read
; GCN: v_pk_mul_f32
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_mixeduse_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  %add.1 = fadd <32 x float> %in.1, %in.1
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
  %mul.1 = fmul <32 x float> %mai.1, %in.1
  store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
  ret void
}

; One loaded value is the accumulator of two separate MFMA calls whose
; results are stored to two addresses. The checks expect AGPR loads and
; stores with no accvgpr copies.
; GCN-LABEL: {{^}}test_multiuse_load_mfma_mfma_store:
; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(<32 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep.1
  store <32 x float> %mai.2, <32 x float> addrspace(1)* %gep.2
  ret void
}

; NB: for atomics both vdata and vdst shall be either VGPR or AGPR
; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic_store:
; GCN: global_atomic_sub [[IN:v[0-9]+]], v{{[0-9:]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}] glc
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[IN]]
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:
; GCN: v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}}
; GCN: global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc
; GCN: global_store_dword v{{[0-9]+}}, v{{[0-9]+}},
define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(i32 addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tid
  %in.1 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst
  %tmp0 = insertelement <4 x i32> undef, i32 %in.1, i32 0
  %tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1
  %tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2
  %tmp3 = insertelement <4 x i32> %tmp2, i32 0, i32 3
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp3, i32 0, i32 0, i32 0)
  %elt = extractelement <4 x i32> %mai.1, i32 0
  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %elt seq_cst
  store i32 %val, i32 addrspace(1)* %arg
  ret void
}

; 64-bit variant of the atomic test: the i64 atomic result is bitcast into the
; <4 x i32> accumulator and two result elements are recombined into the i64
; operand of the second atomic.
; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store:
; GCN: global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:
; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
; GCN: global_atomic_add_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(i64 addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
  %in.1 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 1 seq_cst
  %tmp0 = insertelement <2 x i64> undef, i64 %in.1, i32 0
  %tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1
  ; NOTE(review): %tmp1 is never used; the bitcast below reads %tmp0, so the
  ; upper i64 of the accumulator is undef rather than 0. Possibly unintended;
  ; left as is because changing the operand may alter the checked output.
  %tmp2 = bitcast <2 x i64> %tmp0 to <4 x i32>
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp2, i32 0, i32 0, i32 0)
  %elt.1 = extractelement <4 x i32> %mai.1, i32 0
  %elt.2 = extractelement <4 x i32> %mai.1, i32 1
  %v2.1 = insertelement <2 x i32> undef, i32 %elt.1, i32 0
  %v2.2 = insertelement <2 x i32> %v2.1, i32 %elt.2, i32 1
  %v2 = bitcast <2 x i32> %v2.2 to i64
  %val = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %v2 seq_cst
  store i64 %val, i64 addrspace(1)* %arg
  ret void
}

; NB: both data operands should be VGPR or AGPR
; GCN-LABEL: {{^}}test_load_mfma_ds2_store:
; GCN-DAG: ds_read_b128 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}
; GCN-NOT: v_accvgpr_write
; GCN-DAG: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:{{[0-9]+}}], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
; GCN-NOT: v_accvgpr_read
; GCN: ds_write_b32 v{{[0-9]+}}, a[[N]] offset:128
define amdgpu_kernel void @test_load_mfma_ds2_store(<4 x i32> addrspace(3)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(3)* %arg, i32 %tid
  %in.1 = load <4 x i32>, <4 x i32> addrspace(3)* %gep.1
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
  %elt = extractelement <4 x i32> %mai.1, i32 0
  %ptr = bitcast <4 x i32> addrspace(3)* %arg to i32 addrspace(3)*
  %gep.2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 32
  store i32 1, i32 addrspace(3)* %ptr
  store i32 %elt, i32 addrspace(3)* %gep.2
  ret void
}

; The MFMA accumulator is a loop-carried phi seeded by a global load; the
; final value is stored after the loop. The checks expect the load to write
; the MFMA's accumulator AGPRs and the store to read its result AGPRs, with
; no accvgpr copies.
; GCN-LABEL: {{^}}test_mfma_loop_4xi32:
; GCN: global_load_dwordx4 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_write
; GCN: v_mfma_i32_4x4x4i8 [[RES:a\[[0-9:]+\]]], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
; GCN-NOT: v_accvgpr_read
; GCN: global_store_dwordx4 v[{{[0-9:]+}}], [[RES]],
define amdgpu_kernel void @test_mfma_loop_4xi32(<4 x i32> addrspace(1)* %arg) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
  %in = load <4 x i32>, <4 x i32> addrspace(1)* %gep
  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <4 x i32> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
  ret void
}

; <32 x float> version of the loop test: loop-carried MFMA accumulator seeded
; by a global load and stored after the loop, again expected to stay in AGPRs
; with no accvgpr copies.
; GCN-LABEL: {{^}}test_mfma_loop_32xfloat:
; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
; GCN-NOT: v_accvgpr_write
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr_read
; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}],
; GCN: s_endpgm
define amdgpu_kernel void @test_mfma_loop_32xfloat(<32 x float> addrspace(1)* %arg) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
  %in = load <32 x float>, <32 x float> addrspace(1)* %gep
  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
  ret void
}