; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s

; Intrinsics exercised by the tests in this file.
;
; The bf16 "1k" MFMA variants take <4 x i16> A and B operands; the f64 variants
; take scalar double A and B operands. The third operand is the accumulator and
; has the same type as the result. The three trailing i32 operands show up in
; the expected assembly as the cbsz, abid and blgp modifiers respectively.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16>, <4 x i16>, <4 x float>, i32, i32, i32)
declare <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double, double, <4 x double>, i32, i32, i32)
declare double @llvm.amdgcn.mfma.f64.4x4x4f64(double, double, double, i32, i32, i32)
; NOTE(review): no test visible in this part of the file calls workitem.id.x;
; kept in case a later part of the file uses it.
declare i32 @llvm.amdgcn.workitem.id.x()

; 32x32x4 bf16_1k MFMA with a <32 x float> accumulator loaded from memory.
;
; The A and B operands are the i64 constants 1 and 2 reinterpreted as
; <4 x i16>, so the checks expect each to be materialized with v_mov_b32 into
; the low register of its source pair. Thirty-two v_accvgpr_write_b32 copies
; are expected (one per accumulator element), the trailing i32 operands 1, 2, 3
; must appear as cbsz/abid/blgp modifiers, and the result must be stored from
; AGPRs by eight dwordx4 stores with no v_accvgpr_read_b32 after the MFMA.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x4bf16_1k:
; GCN-DAG:     s_load_dwordx16
; GCN-DAG:     s_load_dwordx16
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-DAG:  v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GFX90A-DAG:  v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A:      v_mfma_f32_32x32x4bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:     v_accvgpr_read_b32
; GCN-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(<32 x float> addrspace(1)* %arg) {
bb:
  ; Accumulator input comes from the same buffer the result is written to.
  %in.1 = load <32 x float>, <32 x float> addrspace(1)* %arg
  ; Constant A/B operands: i64 1 and i64 2 viewed as four i16 lanes.
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <32 x float> %in.1, i32 1, i32 2, i32 3)
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; 16x16x4 bf16_1k MFMA with a <16 x float> accumulator loaded from memory.
;
; Expects sixteen v_accvgpr_write_b32 copies (one per accumulator element), the
; constant A/B operands (1 and 2) materialized by v_mov_b32 into the low
; register of each source pair, cbsz/abid/blgp modifiers 1/2/3 on the MFMA, and
; four dwordx4 stores sourced from AGPRs with no v_accvgpr_read_b32 after the
; MFMA.
; GCN-LABEL: {{^}}test_mfma_f32_16x16x4bf16_1k:
; GCN-DAG:         s_load_dwordx16
; GCN-DAG:         v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG:         v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:          v_mfma_f32_16x16x4bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:         v_accvgpr_read_b32
; GCN-COUNT-4:     global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(<16 x float> addrspace(1)* %arg) {
bb:
  ; Accumulator input comes from the same buffer the result is written to.
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  ; Constant A/B operands: i64 1 and i64 2 viewed as four i16 lanes.
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}

; 4x4x4 bf16_1k MFMA with a <4 x float> accumulator loaded from memory.
;
; Expects four v_accvgpr_write_b32 copies, the constant A/B operands (1 and 2)
; materialized by v_mov_b32, and cbsz/abid/blgp modifiers 1/2/3 on the MFMA.
; The MFMA destination range is captured as RES and must be the data operand of
; the single dwordx4 store, with no v_accvgpr_read_b32 in between.
; GCN-LABEL: {{^}}test_mfma_f32_4x4x4bf16_1k:
; GCN-DAG:        s_load_dwordx4
; GCN-DAG:        v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG:        v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:         v_mfma_f32_4x4x4bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:        v_accvgpr_read_b32
; GCN:            global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(<4 x float> addrspace(1)* %arg) {
bb:
  ; Accumulator input comes from the same buffer the result is written to.
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  ; Constant A/B operands: i64 1 and i64 2 viewed as four i16 lanes.
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; 32x32x8 bf16_1k MFMA with a <16 x float> accumulator loaded from memory.
;
; Expects sixteen v_accvgpr_write_b32 copies (one per accumulator element), the
; constant A/B operands (1 and 2) materialized by v_mov_b32 into the low
; register of each source pair, cbsz/abid/blgp modifiers 1/2/3 on the MFMA, and
; four dwordx4 stores sourced from AGPRs with no v_accvgpr_read_b32 after the
; MFMA.
; GCN-LABEL: {{^}}test_mfma_f32_32x32x8bf16_1k:
; GCN-DAG:         s_load_dwordx16
; GCN-DAG:         v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG:         v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:          v_mfma_f32_32x32x8bf16_1k a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:         v_accvgpr_read_b32
; GCN-COUNT-4:     global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(<16 x float> addrspace(1)* %arg) {
bb:
  ; Accumulator input comes from the same buffer the result is written to.
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  ; Constant A/B operands: i64 1 and i64 2 viewed as four i16 lanes.
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> %a, <4 x i16> %b, <16 x float> %in.1, i32 1, i32 2, i32 3)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}

; 16x16x16 bf16_1k MFMA with a <4 x float> accumulator loaded from memory.
;
; Expects four v_accvgpr_write_b32 copies, the constant A/B operands (1 and 2)
; materialized by v_mov_b32, and cbsz/abid/blgp modifiers 1/2/3 on the MFMA.
; The MFMA destination range is captured as RES and must be the data operand of
; the single dwordx4 store, with no v_accvgpr_read_b32 in between.
; GCN-LABEL: {{^}}test_mfma_f32_16x16x16bf16_1k:
; GCN-DAG:        s_load_dwordx4
; GCN-DAG:        v_mov_b32_e32 v[[TWO:[0-9]+]], 2
; GCN-DAG:        v_mov_b32_e32 v[[ONE:[0-9]+]], 1
; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:         v_mfma_f32_16x16x16bf16_1k [[RES:a\[[0-9]+:[0-9]+\]]], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN-NOT:        v_accvgpr_read_b32
; GCN:            global_store_dwordx4 v{{[0-9]+}}, [[RES]],
define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(<4 x float> addrspace(1)* %arg) {
bb:
  ; Accumulator input comes from the same buffer the result is written to.
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  ; Constant A/B operands: i64 1 and i64 2 viewed as four i16 lanes.
  %a = bitcast i64 1 to <4 x i16>
  %b = bitcast i64 2 to <4 x i16>
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> %a, <4 x i16> %b, <4 x float> %in.1, i32 1, i32 2, i32 3)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; Two chained f64 4x4x4 MFMAs with scalar double accumulators.
;
; The first call uses a 0.0 accumulator and all-zero modifier operands, so the
; check expects a literal 0 as the last operand and nothing after it on the
; line. Its destination is captured as M1 and must be the accumulator operand
; of the second call, which carries cbsz/abid/blgp modifiers 1/2/3. The scalar
; double result is written with a dwordx2 store.
; GCN-LABEL: {{^}}test_mfma_f64_4x4x4f64:
; GFX90A: v_mfma_f64_4x4x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
; GFX90A: v_mfma_f64_4x4x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
; GCN:    global_store_dwordx2
define amdgpu_kernel void @test_mfma_f64_4x4x4f64(double addrspace(1)* %arg, double %a, double %b) {
bb:
  ; Zero accumulator, no modifiers.
  %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
  ; Feeds the first result back in as the accumulator, with modifiers set.
  %mai.2 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double %mai.1, i32 1, i32 2, i32 3)
  store double %mai.2, double addrspace(1)* %arg
  ret void
}

; f64 16x16x4 MFMA with a <4 x double> accumulator loaded from memory.
;
; Expects a dwordx8 scalar load, an MFMA whose accumulator operand is an AGPR
; range with cbsz/abid/blgp modifiers 1/2/3, and two dwordx4 stores for the
; <4 x double> result.
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64:
; GCN:    s_load_dwordx8
; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
; GCN:    global_store_dwordx4
; GCN:    global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64(<4 x double> addrspace(1)* %arg, double %a, double %b) {
bb:
  ; Accumulator input comes from the same buffer the result is written to.
  %in.1 = load <4 x double>, <4 x double> addrspace(1)* %arg
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %in.1, i32 1, i32 2, i32 3)
  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
  ret void
}

; Two chained f64 16x16x4 MFMAs where the first accumulator is an all-zero
; <4 x double> splat.
;
; The zero splat is expected to appear as a literal 0 accumulator operand, with
; nothing after it on the line (the first call passes all-zero modifier
; operands). The first destination is captured as M1 and must be the
; accumulator of the second call, which carries cbsz/abid/blgp modifiers 1/2/3.
; The result is written with two dwordx4 stores.
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_imm:
; GFX90A: v_mfma_f64_16x16x4f64 [[M1:a\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], 0{{$}}
; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], [[M1]] cbsz:1 abid:2 blgp:3
; GCN:    global_store_dwordx4
; GCN:    global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) {
bb:
  ; Zero-splat accumulator, no modifiers.
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
  ; Feeds the first result back in as the accumulator, with modifiers set.
  %mai.2 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> %mai.1, i32 1, i32 2, i32 3)
  store <4 x double> %mai.2, <4 x double> addrspace(1)* %arg
  ret void
}

; f64 16x16x4 MFMA whose accumulator is a constant vector that is not a splat
; (three 0.0 elements followed by 1.0).
;
; Unlike the zero-splat case, the check expects the accumulator operand to be
; an AGPR range rather than a literal. The call passes all-zero modifier
; operands, so nothing may follow the accumulator operand on the line. The
; result is written with two dwordx4 stores.
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_imm:
; GFX90A: v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
; GCN:    global_store_dwordx4
; GCN:    global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(<4 x double> addrspace(1)* %arg, double %a, double %b) {
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
  ret void
}

; f64 16x16x4 MFMA whose accumulator is a splat of the constant 123.0.
;
; The checks expect the accumulator to be built in AGPRs: a v_accvgpr_write_b32
; of literal 0, and a v_mov_b32 of 0x405ec000, which is the upper 32 bits of
; 123.0 encoded as an IEEE-754 double (the lower 32 bits are zero). The MFMA
; must take an AGPR range as its accumulator with nothing after it on the line
; (the call passes all-zero modifier operands). The result is written with two
; dwordx4 stores.
; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_lit:
; GCN-DAG:    v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000
; GFX90A:     v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
; GCN:        global_store_dwordx4
; GCN:        global_store_dwordx4
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(<4 x double> addrspace(1)* %arg, double %a, double %b) {
bb:
  %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
  store <4 x double> %mai.1, <4 x double> addrspace(1)* %arg
  ret void
}
