; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s

; Round-trips a value through an a-register using two inline asm statements:
; the "=a" output of the first asm is fed to the "a" input of the second,
; whose "=v" result is stored to %arg.
; The checks require the write and the read to name the same a-register, and
; the stored VGPR to be the one defined by the read.
; GCN-LABEL: {{^}}accvgpr_write_read:
; GFX908: v_accvgpr_write [[AREG:a[0-9]+]], 1
; GFX908: v_accvgpr_read [[VREG:v[0-9]+]], [[AREG]]
; GFX908: global_store_dword {{[^,]+}}, [[VREG]], off
define amdgpu_kernel void @accvgpr_write_read(float addrspace(1)* %arg) {
bb:
  ; No value is loaded from %arg here; it is only the store destination.
  %init = tail call float asm "v_accvgpr_write $0, 1", "=a"()
  %read = tail call float asm "v_accvgpr_read $0, $1", "=v,a"(float %init)
  store float %read, float addrspace(1)* %arg
  ret void
}

; Inline asm MFMA with the result ($0) and the accumulator input ($3)
; constrained to "a", and both scalar float operands ($1, $2) constrained
; to "v". The <4 x float> loaded from %arg is the accumulator input and the
; asm result is stored back to the same address.
; The checks expect four v_accvgpr_write_b32 before the MFMA and four
; v_accvgpr_read_b32 after it, and that $1/$2 print as v-registers.
; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_avva:
; GFX908-COUNT-4: v_accvgpr_write_b32
; GFX908: v_mfma_f32_4x4x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908-COUNT-4: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_4x4x1f32_avva(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,v,v,a"(float 1.0, float 2.0, <4 x float> %in.1)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; Same MFMA as the "avva" variant, but every operand (result, both scalar
; float inputs and the accumulator input) is constrained to "a".
; The checks expect at least four v_accvgpr_write_b32 before the MFMA, all
; three source operands printed as a-registers, and four v_accvgpr_read_b32
; after it.
; GCN-LABEL: {{^}}v_mfma_f32_4x4x1f32_aaaa:
; GFX908-COUNT-4: v_accvgpr_write_b32
; GFX908: v_mfma_f32_4x4x1f32 a[{{[0-9:]+}}], a{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908-COUNT-4: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_4x4x1f32_aaaa(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x1f32 $0, $1, $2, $3", "=a,a,a,a"(float 1.0, float 2.0, <4 x float> %in.1)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; f16 MFMA through inline asm with all operands constrained to "a". $1 and $2
; are constant <4 x half> vectors, so the checks expect them to print as
; a-register ranges (a[N:M]) rather than single registers.
; Eight v_accvgpr_write_b32 are expected before the MFMA (presumably four for
; the <4 x float> accumulator plus two for each <4 x half> operand) and four
; v_accvgpr_read_b32 after it.
; GCN-LABEL: {{^}}v_mfma_f32_4x4x4f16_aaaa:
; GFX908-COUNT-8: v_accvgpr_write_b32
; GFX908: v_mfma_f32_4x4x4f16 a[{{[0-9:]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}], a[{{[0-9:]+}}]
; GFX908-COUNT-4: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_4x4x4f16_aaaa(<4 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> asm "v_mfma_f32_4x4x4f16 $0, $1, $2, $3", "=a,a,a,a"(<4 x half> <half 0xH3800, half 0xH3800, half 0xH3800, half 0xH3800>, <4 x half> <half 0xH03FF, half 0xH03FF, half 0xH03FF, half 0xH03FF>, <4 x float> %in.1)
  store <4 x float> %mai.1, <4 x float> addrspace(1)* %arg
  ret void
}

; 16-wide MFMA through inline asm with mixed constraints: $1 is "v", while the
; result, $2 and the <16 x float> accumulator input are "a".
; Seventeen v_accvgpr_write_b32 are expected before the MFMA (presumably
; sixteen for the accumulator plus one for the scalar $2 operand) and sixteen
; v_accvgpr_read_b32 after it.
; GCN-LABEL: {{^}}v_mfma_f32_16x16x1f32_avaa:
; GFX908-COUNT-17: v_accvgpr_write_b32
; GFX908: v_mfma_f32_16x16x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908-COUNT-16: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_16x16x1f32_avaa(<16 x float> addrspace(1)* %arg) {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %mai.1 = tail call <16 x float> asm "v_mfma_f32_16x16x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <16 x float> %in.1)
  store <16 x float> %mai.1, <16 x float> addrspace(1)* %arg
  ret void
}

; 32-wide MFMA through inline asm with mixed constraints: $1 is "v", while the
; result, $2 and the 32-element accumulator input are "a".
; Thirty-three v_accvgpr_write_b32 are expected before the MFMA (presumably
; thirty-two for the accumulator plus one for the scalar $2 operand) and
; thirty-two v_accvgpr_read_b32 after it.
; NOTE(review): the accumulator and result are typed <32 x i32> although the
; mnemonic is an f32 MFMA; kept as is - confirm whether <32 x float> was
; intended.
; GCN-LABEL: {{^}}v_mfma_f32_32x32x1f32_avaa:
; GFX908-COUNT-33: v_accvgpr_write_b32
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, a{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX908-COUNT-32: v_accvgpr_read_b32
define amdgpu_kernel void @v_mfma_f32_32x32x1f32_avaa(<32 x i32> addrspace(1)* %arg) {
bb:
  %in.1 = load <32 x i32>, <32 x i32> addrspace(1)* %arg
  %mai.1 = tail call <32 x i32> asm "v_mfma_f32_32x32x1f32 $0, $1, $2, $3", "=a,v,a,a"(float 1.0, float 2.0, <32 x i32> %in.1)
  store <32 x i32> %mai.1, <32 x i32> addrspace(1)* %arg
  ret void
}
