1; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
2
3declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
4declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
5declare i32 @llvm.amdgcn.workitem.id.x()
6
; Load a <32 x float> from %arg[workitem id], pass it as the third (vector)
; operand of the 32x32x1f32 MFMA and store the result to the same address.
; Expected output: the eight dwordx4 loads write a[...] registers directly, no
; v_accvgpr_write appears before the MFMA, the MFMA is immediately followed by
; s_nop 7 / s_nop 7 / s_nop 2, and the eight dwordx4 stores read a[...]
; registers with no v_accvgpr_read in between.
7; GCN-LABEL:  {{^}}test_load_mfma_store16:
8; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
9; GCN-NOT:     v_accvgpr_write
10; GCN:         v_mfma_f32_32x32x1f32
11; GCN-NEXT:    s_nop 7
12; GCN-NEXT:    s_nop 7
13; GCN-NEXT:    s_nop 2
14; GCN-NOT:     v_accvgpr_read
15; GCN-COUNT-8: global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}], s[{{[0-9:]+}}]
16define amdgpu_kernel void @test_load_mfma_store16(<32 x float> addrspace(1)* %arg) {
17bb:
18  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Per-workitem element: both the load and the store use this address.
19  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
20  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  ; The loaded vector is used only by the MFMA; the MFMA result only by the store.
21  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
22  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
23  ret void
24}
25
; Load a single float, insert it as element 0 of an otherwise zero
; <32 x float>, run the 32x32x1f32 MFMA on that vector, then extract element 0
; of the result and store it back to the same address.
; Expected output: the dword load writes a single a-register, the MFMA
; destination range starts at a[N], the MFMA is immediately followed by
; s_nop 7 / s_nop 7 / s_nop 2, and the very next instruction is a dword store
; that reads a[N] directly.
; NOTE(review): the first NOT check below names v_accvgpr_read, whereas the
; sibling tests forbid v_accvgpr_write before the MFMA -- confirm this is the
; intended instruction.
26; GCN-LABEL: {{^}}test_load1_mfma_store1:
27; GCN:      global_load_dword a{{[0-9]+}}, v{{[0-9:]+}}, s[{{[0-9:]+}}]
28; GCN-NOT:  v_accvgpr_read
29; GCN:      v_mfma_f32_32x32x1f32 a{{\[}}[[N:[0-9]+]]:
30; GCN-NEXT: s_nop 7
31; GCN-NEXT: s_nop 7
32; GCN-NEXT: s_nop 2
33; GCN-NOT:  v_accvgpr_read
34; GCN-NEXT: global_store_dword v{{[0-9:]+}}, a[[N]], s[{{[0-9:]+}}]
35define amdgpu_kernel void @test_load1_mfma_store1(float addrspace(1)* %arg) {
36bb:
37  %tid = call i32 @llvm.amdgcn.workitem.id.x()
38  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
39  %in.1 = load float, float addrspace(1)* %gep
  ; Only element 0 comes from memory; elements 1..31 are zero.
40  %init = insertelement <32 x float> zeroinitializer, float %in.1, i32 0
41  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %init, i32 1, i32 2, i32 3)
  ; Only element 0 of the MFMA result is stored.
42  %elt = extractelement <32 x float> %mai.1, i32 0
43  store float %elt, float addrspace(1)* %gep
44  ret void
45}
46
; <4 x i32> variant: load a vector from %arg[workitem id], pass it as the third
; operand of the i32 4x4x4i8 MFMA and store the result to the same address.
; Expected output: the dwordx4 load writes an a[...] range, no v_accvgpr_write
; precedes the MFMA, the MFMA destination range (captured as A) is followed by
; s_nop 4, and the next instruction is a dwordx4 store reading that same range.
47; GCN-LABEL: {{^}}test_load4_mfma_store4:
48; GCN:      global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
49; GCN-NOT:  v_accvgpr_write
50; GCN:      v_mfma_i32_4x4x4i8 [[A:a\[[0-9:]+\]]]
51; GCN-NEXT: s_nop 4
52; GCN-NOT:  v_accvgpr_read
53; GCN-NEXT: global_store_dwordx4 v{{[0-9:]+}}, [[A]], s[{{[0-9:]+}}]
54define amdgpu_kernel void @test_load4_mfma_store4(<4 x i32> addrspace(1)* %arg) {
55bb:
56  %tid = call i32 @llvm.amdgcn.workitem.id.x()
57  %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
58  %in.1 = load <4 x i32>, <4 x i32> addrspace(1)* %gep
  ; The load's only user is the MFMA; the MFMA's only user is the store.
59  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
60  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
61  ret void
62}
63
; Control case with no MFMA: a <32 x float> is loaded and stored unchanged to a
; second address 32 elements past the first.
; Expected output: the eight dwordx4 loads write v[...] registers, no
; v_accvgpr instruction of any kind appears, and the eight dwordx4 stores read
; v[...] registers.
64; GCN-LABEL: {{^}}test_load_store:
65; GCN-COUNT-8: global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
66; GCN-NOT:     v_accvgpr
67; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
68define amdgpu_kernel void @test_load_store(<32 x float> addrspace(1)* %arg) {
69bb:
70  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.1 is the load address; %gep.2 (32 vectors further) is the store address.
71  %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
72  %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
73  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
74  store <32 x float> %in.1, <32 x float> addrspace(1)* %gep.2
75  ret void
76}
77
; The loaded <32 x float> is first added to itself (fadd) and the sum, not the
; raw load, is the third operand of the MFMA; the MFMA result is stored back.
; Expected output: the eight dwordx4 loads write v[...] registers, 32
; v_accvgpr_write instructions precede the MFMA, the MFMA is immediately
; followed by s_nop 7 / s_nop 7 / s_nop 2, and the eight dwordx4 stores read
; a[...] registers with no v_accvgpr_read in between.
78; GCN-LABEL: {{^}}test_load_add_mfma_store:
79; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
80; GCN-COUNT-32: v_accvgpr_write
81; GCN:          v_mfma_f32_32x32x1f32
82; GCN-NEXT:     s_nop 7
83; GCN-NEXT:     s_nop 7
84; GCN-NEXT:     s_nop 2
85; GCN-NOT:      v_accvgpr_read
86; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, a[{{[0-9:]+}}]
87define amdgpu_kernel void @test_load_add_mfma_store(<32 x float> addrspace(1)* %arg) {
88bb:
89  %tid = call i32 @llvm.amdgcn.workitem.id.x()
90  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
91  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
  ; The fadd sits between the load and the MFMA.
92  %add.1 = fadd <32 x float> %in.1, %in.1
93  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
94  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
95  ret void
96}
97
; Control case with no MFMA: the loaded <32 x float> is added to itself and the
; sum is stored back to the same address.
; Expected output: the eight dwordx4 loads write v[...] registers, 16
; v_pk_add_f32 instructions perform the add, the eight dwordx4 stores read
; v[...] registers, and no v_accvgpr instruction appears before or after the
; adds.
98; GCN-LABEL: {{^}}test_load_add_store:
99; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
100; GCN-NOT:      v_accvgpr
101; GCN-COUNT-16: v_pk_add_f32
102; GCN-NOT:      v_accvgpr
103; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
104define amdgpu_kernel void @test_load_add_store(<32 x float> addrspace(1)* %arg) {
105bb:
106  %tid = call i32 @llvm.amdgcn.workitem.id.x()
107  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
108  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
109  %add.1 = fadd <32 x float> %in.1, %in.1
110  store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
111  ret void
112}
113
; The loaded <32 x float> feeds the MFMA and is also an operand of the fadd
; applied to the MFMA result; the sum is what gets stored.
; Expected output: the eight dwordx4 loads write v[...] registers, 32
; v_accvgpr_write instructions precede the MFMA, 32 v_accvgpr_read
; instructions follow it, then v_pk_add_f32, and the eight dwordx4 stores read
; v[...] registers.
114; GCN-LABEL: {{^}}test_load_mfma_add_store:
115; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
116; GCN-COUNT-32: v_accvgpr_write
117; GCN:          v_mfma_f32_32x32x1f32
118; GCN-COUNT-32: v_accvgpr_read
119; GCN:          v_pk_add_f32
120; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
121define amdgpu_kernel void @test_load_mfma_add_store(<32 x float> addrspace(1)* %arg) {
122bb:
123  %tid = call i32 @llvm.amdgcn.workitem.id.x()
124  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
125  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
126  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
  ; %in.1 has two users: the MFMA above and this fadd.
127  %add.1 = fadd <32 x float> %mai.1, %in.1
128  store <32 x float> %add.1, <32 x float> addrspace(1)* %gep
129  ret void
130}
131
; Arithmetic on both sides of the MFMA: the load is added to itself before the
; MFMA, and the MFMA result is multiplied by itself before the store.
; Expected output: the eight dwordx4 loads write v[...] registers, then
; v_pk_add_f32, 32 v_accvgpr_write instructions, the MFMA, 32 v_accvgpr_read
; instructions, v_pk_mul_f32, and eight dwordx4 stores reading v[...]
; registers.
132; GCN-LABEL: {{^}}test_load_add_mfma_mul_store:
133; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
134; GCN:          v_pk_add_f32
135; GCN-COUNT-32: v_accvgpr_write
136; GCN:          v_mfma_f32_32x32x1f32
137; GCN-COUNT-32: v_accvgpr_read
138; GCN:          v_pk_mul_f32
139; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
140define amdgpu_kernel void @test_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) {
141bb:
142  %tid = call i32 @llvm.amdgcn.workitem.id.x()
143  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
144  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
145  %add.1 = fadd <32 x float> %in.1, %in.1
146  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
  ; Both fmul operands are the MFMA result.
147  %mul.1 = fmul <32 x float> %mai.1, %mai.1
148  store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
149  ret void
150}
151
; Mixed-use variant: the load is added to itself to form the MFMA input, and
; the same loaded value is also the second operand of the fmul applied to the
; MFMA result.
; Expected output: the eight dwordx4 loads write v[...] registers, 32
; v_accvgpr_write instructions precede the MFMA, 32 v_accvgpr_read
; instructions follow it, then v_pk_mul_f32, and the eight dwordx4 stores read
; v[...] registers. The fadd itself is not matched by any check here.
152; GCN-LABEL: {{^}}test_mixeduse_load_add_mfma_mul_store:
153; GCN-COUNT-8:  global_load_dwordx4 v[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
154; GCN-COUNT-32: v_accvgpr_write
155; GCN:          v_mfma_f32_32x32x1f32
156; GCN-COUNT-32: v_accvgpr_read
157; GCN:          v_pk_mul_f32
158; GCN-COUNT-8:  global_store_dwordx4 v{{[0-9:]+}}, v[{{[0-9:]+}}]
159define amdgpu_kernel void @test_mixeduse_load_add_mfma_mul_store(<32 x float> addrspace(1)* %arg) {
160bb:
161  %tid = call i32 @llvm.amdgcn.workitem.id.x()
162  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
163  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep
164  %add.1 = fadd <32 x float> %in.1, %in.1
165  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %add.1, i32 1, i32 2, i32 3)
  ; Unlike test_load_add_mfma_mul_store, the second fmul operand is the raw load.
166  %mul.1 = fmul <32 x float> %mai.1, %in.1
167  store <32 x float> %mul.1, <32 x float> addrspace(1)* %gep
168  ret void
169}
170
; One loaded <32 x float> is the third operand of two separate MFMA calls that
; differ only in their trailing i32 arguments (1, 2, 3 versus 0, 0, 0). The two
; results are stored to the load address and to an address 32 vectors further.
; Expected output: the eight dwordx4 loads write a[...] registers, no
; v_accvgpr_write precedes the MFMA, and dwordx4 stores read a[...] registers
; with no v_accvgpr_read in between. Only one MFMA and one group of eight
; stores are matched explicitly.
171; GCN-LABEL: {{^}}test_multiuse_load_mfma_mfma_store:
172; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
173; GCN-NOT:     v_accvgpr_write
174; GCN:         v_mfma_f32_32x32x1f32
175; GCN-NOT:     v_accvgpr_read
176; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
177define amdgpu_kernel void @test_multiuse_load_mfma_mfma_store(<32 x float> addrspace(1)* %arg) {
178bb:
179  %tid = call i32 @llvm.amdgcn.workitem.id.x()
180  %gep.1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
181  %gep.2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %gep.1, i32 32
182  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %gep.1
  ; Both MFMA calls consume the same loaded value %in.1.
183  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
184  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
185  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep.1
186  store <32 x float> %mai.2, <32 x float> addrspace(1)* %gep.2
187  ret void
188}
189
; The value returned by a 32-bit atomic sub becomes element 0 of the MFMA input
; vector (elements 1..3 are zero); element 0 of the MFMA result is the operand
; of a following atomic add, whose returned value is stored to %arg.
; Expected output: the atomic sub returns into a v-register (IN), four
; v_accvgpr_write_b32 instructions (one of them from IN) build the MFMA input,
; a v_accvgpr_read_b32 copies a[N] into a v-register (V) that the atomic add
; uses as its data operand, and the final dword store reads a v-register.
190; NB: for atomics both vdata and vdst shall be either VGPR or AGPR
191; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic_store:
192; GCN:     global_atomic_sub [[IN:v[0-9]+]], v{{[0-9:]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}] glc
193; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[IN]]
194; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
195; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
196; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
197; GCN:     v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:
198; GCN:     v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}}
199; GCN:     global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc
200; GCN:     global_store_dword v{{[0-9]+}}, v{{[0-9]+}},
201define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic_store(i32 addrspace(1)* %arg) {
202bb:
203  %tid = call i32 @llvm.amdgcn.workitem.id.x()
204  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %tid
  ; The atomic's returned (old) value is used, so it must produce a result.
205  %in.1 = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst
  ; Build <in.1, 0, 0, 0> as the MFMA's vector operand.
206  %tmp0 = insertelement <4 x i32> undef, i32 %in.1, i32 0
207  %tmp1 = insertelement <4 x i32> %tmp0, i32 0, i32 1
208  %tmp2 = insertelement <4 x i32> %tmp1, i32 0, i32 2
209  %tmp3 = insertelement <4 x i32> %tmp2, i32 0, i32 3
210  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp3, i32 0, i32 0, i32 0)
  ; Element 0 of the MFMA result is the data operand of the second atomic.
211  %elt = extractelement <4 x i32> %mai.1, i32 0
212  %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %elt seq_cst
  ; Note the store goes to the base pointer %arg, not the per-workitem %gep.
213  store i32 %val, i32 addrspace(1)* %arg
214  ret void
215}
216
; 64-bit variant of the atomic test: the i64 returned by an atomic sub is
; placed in a <2 x i64>, reinterpreted as <4 x i32> and passed to the MFMA;
; elements 0 and 1 of the MFMA result are recombined into an i64 that is the
; operand of an atomic add, whose returned value is stored to %arg.
; Expected output: the x2 atomic sub returns into a v[...] pair, four
; v_accvgpr_write_b32 instructions from v-registers precede the MFMA, two
; v_accvgpr_read_b32 instructions follow it, and the x2 atomic add takes a
; v[...] pair as its data operand. The final i64 store is not matched by any
; check.
217; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store:
218; GCN:         global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
219; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
220; GCN:         v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:
221; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
222; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
223; GCN:         global_atomic_add_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
224define amdgpu_kernel void @test_atomic_mfma_4xi32_atomic64_store(i64 addrspace(1)* %arg) {
225bb:
226  %tid = call i32 @llvm.amdgcn.workitem.id.x()
227  %gep = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
228  %in.1 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 1 seq_cst
229  %tmp0 = insertelement <2 x i64> undef, i64 %in.1, i32 0
  ; NOTE(review): %tmp1 is never used -- the bitcast below reads %tmp0, so the
  ; upper i64 of the MFMA input is undef rather than zero. This looks like a
  ; typo for %tmp1; changing it may alter the generated code, so regenerate and
  ; re-verify the checks above before fixing.
230  %tmp1 = insertelement <2 x i64> %tmp0, i64 0, i32 1
231  %tmp2 = bitcast <2 x i64> %tmp0 to <4 x i32>
232  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %tmp2, i32 0, i32 0, i32 0)
  ; Reassemble elements 0 and 1 of the result into one i64 for the atomic add.
233  %elt.1 = extractelement <4 x i32> %mai.1, i32 0
234  %elt.2 = extractelement <4 x i32> %mai.1, i32 1
235  %v2.1 = insertelement <2 x i32> undef, i32 %elt.1, i32 0
236  %v2.2 = insertelement <2 x i32> %v2.1, i32 %elt.2, i32 1
237  %v2 = bitcast <2 x i32> %v2.2 to i64
238  %val = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %v2 seq_cst
239  store i64 %val, i64 addrspace(1)* %arg
240  ret void
241}
242
; addrspace(3) variant: a <4 x i32> is loaded from %arg[workitem id] and passed
; to the MFMA; the constant 1 is stored to the base pointer and element 0 of
; the MFMA result is stored 32 i32 elements (128 bytes) past the base.
; Expected output: ds_read_b128 writes an a[...] range (IN) that is the last
; MFMA operand, no v_accvgpr_write or v_accvgpr_read appears, one ds_write_b32
; takes its data from a v-register, and the ds_write_b32 at offset:128 takes
; its data directly from a[N], the first register of the MFMA destination.
243; NB: both data operands should be VGPR or AGPR
244; GCN-LABEL: {{^}}test_load_mfma_ds2_store:
245; GCN-DAG: ds_read_b128 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}
246; GCN-NOT: v_accvgpr_write
247; GCN-DAG: v_mfma_i32_4x4x4i8 a{{\[}}[[N:[0-9]+]]:{{[0-9]+}}], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
248; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
249; GCN-NOT: v_accvgpr_read
250; GCN:     ds_write_b32 v{{[0-9]+}}, a[[N]] offset:128
251define amdgpu_kernel void @test_load_mfma_ds2_store(<4 x i32> addrspace(3)* %arg) {
252bb:
253  %tid = call i32 @llvm.amdgcn.workitem.id.x()
254  %gep.1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(3)* %arg, i32 %tid
255  %in.1 = load <4 x i32>, <4 x i32> addrspace(3)* %gep.1
256  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 0, i32 0, i32 0)
257  %elt = extractelement <4 x i32> %mai.1, i32 0
  ; Two stores off the same base pointer: a constant at offset 0 and the MFMA
  ; element at i32 index 32.
258  %ptr = bitcast <4 x i32> addrspace(3)* %arg to i32 addrspace(3)*
259  %gep.2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr, i32 32
260  store i32 1, i32 addrspace(3)* %ptr
261  store i32 %elt, i32 addrspace(3)* %gep.2
262  ret void
263}
264
; Loop-carried MFMA: a <4 x i32> is loaded before the loop, the loop body
; repeatedly feeds the previous MFMA result back in as the third operand, and
; the final result is stored after the loop.
; Expected output: the dwordx4 load writes an a[...] range (IN), no
; v_accvgpr_write precedes the MFMA, the MFMA takes IN as its last operand and
; writes an a[...] range (RES), and the dwordx4 store reads RES with no
; v_accvgpr_read in between.
265; GCN-LABEL: {{^}}test_mfma_loop_4xi32:
266; GCN:     global_load_dwordx4 [[IN:a\[[0-9:]+\]]], v{{[0-9:]+}}, s[{{[0-9:]+}}]
267; GCN-NOT: v_accvgpr_write
268; GCN:     v_mfma_i32_4x4x4i8 [[RES:a\[[0-9:]+\]]], v{{[0-9:]+}}, v{{[0-9:]+}}, [[IN]]
269; GCN-NOT: v_accvgpr_read
270; GCN:     global_store_dwordx4 v[{{[0-9:]+}}], [[RES]],
271define amdgpu_kernel void @test_mfma_loop_4xi32(<4 x i32> addrspace(1)* %arg) {
272entry:
273  %tid = call i32 @llvm.amdgcn.workitem.id.x()
274  %gep = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %tid
275  %in = load <4 x i32>, <4 x i32> addrspace(1)* %gep
276  br label %for.cond.preheader
277
278for.cond.preheader:
  ; %phi is the loaded value on entry and the previous MFMA result afterwards.
279  %phi = phi <4 x i32> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
280  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
281  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %phi, i32 0, i32 0, i32 0)
  ; The counter starts at 0 and the loop exits once %inc reaches 16.
282  %inc = add nuw nsw i32 %c, 1
283  %cc = icmp eq i32 %inc, 16
284  br i1 %cc, label %exit, label %for.cond.preheader
285
286exit:
287  store <4 x i32> %mai.1, <4 x i32> addrspace(1)* %gep
288  ret void
289}
290
; Loop-carried MFMA on <32 x float>: the vector is loaded before the loop, the
; loop body feeds the previous MFMA result back in as the third operand, and
; the final result is stored after the loop.
; Expected output: the eight dwordx4 loads write a[...] registers, no
; v_accvgpr_write precedes the MFMA, the eight dwordx4 stores read a[...]
; registers with no v_accvgpr_read in between, and s_endpgm follows the stores.
291; GCN-LABEL: {{^}}test_mfma_loop_32xfloat:
292; GCN-COUNT-8: global_load_dwordx4 a[{{[0-9:]+}}], v{{[0-9:]+}}, s[{{[0-9:]+}}]
293; GCN-NOT:     v_accvgpr_write
294; GCN:         v_mfma_f32_32x32x1f32
295; GCN-NOT:     v_accvgpr_read
296; GCN-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}],
297; GCN:         s_endpgm
298define amdgpu_kernel void @test_mfma_loop_32xfloat(<32 x float> addrspace(1)* %arg) {
299entry:
300  %tid = call i32 @llvm.amdgcn.workitem.id.x()
301  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %arg, i32 %tid
302  %in = load <32 x float>, <32 x float> addrspace(1)* %gep
303  br label %for.cond.preheader
304
305for.cond.preheader:
  ; %phi is the loaded value on entry and the previous MFMA result afterwards.
306  %phi = phi <32 x float> [ %in, %entry ], [ %mai.1, %for.cond.preheader ]
307  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
308  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  ; The counter starts at 0 and the loop exits once %inc reaches 16.
309  %inc = add nuw nsw i32 %c, 1
310  %cc = icmp eq i32 %inc, 16
311  br i1 %cc, label %exit, label %for.cond.preheader
312
313exit:
314  store <32 x float> %mai.1, <32 x float> addrspace(1)* %gep
315  ret void
316}
317