1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck --check-prefix=GFX7 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10_W32 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX10_W64 %s
6
; Function with no explicit calling convention: forwards %a, %b, %c and the
; i1 %d unchanged to llvm.amdgcn.div.fmas.f32 and returns the result.
; The checks expect %d (arriving in v3) to be masked to bit 0 and compared
; not-equal to 0 to produce vcc (vcc_lo for GFX10 wave32) ahead of
; v_div_fmas_f32. The GFX7 and GFX8 checks additionally expect "s_nop 3"
; between the compare and v_div_fmas_f32; the GFX10 checks contain no s_nop.
7define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
8; GFX7-LABEL: v_div_fmas_f32:
9; GFX7:       ; %bb.0:
10; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
12; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
13; GFX7-NEXT:    s_nop 3
14; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
15; GFX7-NEXT:    s_setpc_b64 s[30:31]
16;
17; GFX8-LABEL: v_div_fmas_f32:
18; GFX8:       ; %bb.0:
19; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
21; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
22; GFX8-NEXT:    s_nop 3
23; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
24; GFX8-NEXT:    s_setpc_b64 s[30:31]
25;
26; GFX10_W32-LABEL: v_div_fmas_f32:
27; GFX10_W32:       ; %bb.0:
28; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29; GFX10_W32-NEXT:    s_waitcnt_vscnt null, 0x0
30; GFX10_W32-NEXT:    v_and_b32_e32 v3, 1, v3
31; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
32; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
33; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
34;
35; GFX10_W64-LABEL: v_div_fmas_f32:
36; GFX10_W64:       ; %bb.0:
37; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GFX10_W64-NEXT:    s_waitcnt_vscnt null, 0x0
39; GFX10_W64-NEXT:    v_and_b32_e32 v3, 1, v3
40; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
41; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
42; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
43  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
44  ret float %result
45}
46
; Double-precision counterpart of the f32 function-argument test: forwards
; %a, %b, %c and the i1 %d unchanged to llvm.amdgcn.div.fmas.f64 and returns
; the result.
; The checks expect the three doubles in register pairs v[0:1], v[2:3], v[4:5]
; and %d in v6, which is masked to bit 0 and compared not-equal to 0 to
; produce vcc (vcc_lo for GFX10 wave32). As in the f32 case, only the GFX7 and
; GFX8 checks expect "s_nop 3" before v_div_fmas_f64.
47define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
48; GFX7-LABEL: v_div_fmas_f64:
49; GFX7:       ; %bb.0:
50; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
51; GFX7-NEXT:    v_and_b32_e32 v6, 1, v6
52; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
53; GFX7-NEXT:    s_nop 3
54; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
55; GFX7-NEXT:    s_setpc_b64 s[30:31]
56;
57; GFX8-LABEL: v_div_fmas_f64:
58; GFX8:       ; %bb.0:
59; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
61; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
62; GFX8-NEXT:    s_nop 3
63; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
64; GFX8-NEXT:    s_setpc_b64 s[30:31]
65;
66; GFX10_W32-LABEL: v_div_fmas_f64:
67; GFX10_W32:       ; %bb.0:
68; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
69; GFX10_W32-NEXT:    s_waitcnt_vscnt null, 0x0
70; GFX10_W32-NEXT:    v_and_b32_e32 v6, 1, v6
71; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
72; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
73; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
74;
75; GFX10_W64-LABEL: v_div_fmas_f64:
76; GFX10_W64:       ; %bb.0:
77; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78; GFX10_W64-NEXT:    s_waitcnt_vscnt null, 0x0
79; GFX10_W64-NEXT:    v_and_b32_e32 v6, 1, v6
80; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
81; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
82; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
83  %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
84  ret double %result
85}
86
; amdgpu_ps variant where every argument is marked inreg; the checks show them
; arriving in s0..s3. The i1 condition is not an argument here: it is computed
; as (%d == 0) and passed to llvm.amdgcn.div.fmas.f32.
; The checks expect the compare to be done on the scalar side
; (s_cmp_eq_u32 / s_cselect_b32 / s_and_b32) and then moved into vcc with
; v_cmp_ne_u32_e64. The GFX7 and GFX8 checks copy all three float operands
; into VGPRs and expect "s_nop 3"; the GFX10 checks use s0 directly as the
; first operand of v_div_fmas_f32 and contain no s_nop.
87define amdgpu_ps float @s_div_fmas_f32(float inreg %a, float inreg %b, float inreg %c, i32 inreg %d) {
88; GFX7-LABEL: s_div_fmas_f32:
89; GFX7:       ; %bb.0:
90; GFX7-NEXT:    s_cmp_eq_u32 s3, 0
91; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
92; GFX7-NEXT:    v_mov_b32_e32 v0, s0
93; GFX7-NEXT:    s_and_b32 s0, 1, s3
94; GFX7-NEXT:    v_mov_b32_e32 v1, s1
95; GFX7-NEXT:    v_mov_b32_e32 v2, s2
96; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
97; GFX7-NEXT:    s_nop 3
98; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
99; GFX7-NEXT:    ; return to shader part epilog
100;
101; GFX8-LABEL: s_div_fmas_f32:
102; GFX8:       ; %bb.0:
103; GFX8-NEXT:    s_cmp_eq_u32 s3, 0
104; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
105; GFX8-NEXT:    v_mov_b32_e32 v0, s0
106; GFX8-NEXT:    s_and_b32 s0, 1, s3
107; GFX8-NEXT:    v_mov_b32_e32 v1, s1
108; GFX8-NEXT:    v_mov_b32_e32 v2, s2
109; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
110; GFX8-NEXT:    s_nop 3
111; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
112; GFX8-NEXT:    ; return to shader part epilog
113;
114; GFX10_W32-LABEL: s_div_fmas_f32:
115; GFX10_W32:       ; %bb.0:
116; GFX10_W32-NEXT:    s_cmp_eq_u32 s3, 0
117; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s1
118; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s2
119; GFX10_W32-NEXT:    s_cselect_b32 s3, 1, 0
120; GFX10_W32-NEXT:    s_and_b32 s3, 1, s3
121; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
122; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
123; GFX10_W32-NEXT:    ; return to shader part epilog
124;
125; GFX10_W64-LABEL: s_div_fmas_f32:
126; GFX10_W64:       ; %bb.0:
127; GFX10_W64-NEXT:    s_cmp_eq_u32 s3, 0
128; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s1
129; GFX10_W64-NEXT:    s_cselect_b32 s3, 1, 0
130; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s2
131; GFX10_W64-NEXT:    s_and_b32 s3, 1, s3
132; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
133; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
134; GFX10_W64-NEXT:    ; return to shader part epilog
135  %vcc = icmp eq i32 %d, 0
136  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %vcc)
137  ret float %result
138}
139
; amdgpu_ps, all-inreg, double-precision variant. The checks show the doubles
; arriving in s[0:1], s[2:3], s[4:5] and %d in s6. The i1 condition is
; computed as (%d == 0) and passed to llvm.amdgcn.div.fmas.f64.
; The checks expect the same scalar compare + v_cmp_ne_u32_e64 sequence as the
; f32 shader test, and additionally expect the 64-bit result to be moved back
; to s0/s1 with two v_readfirstlane_b32 instructions before the epilog.
; The GFX7 and GFX8 checks copy all operands to VGPRs and expect "s_nop 3";
; the GFX10 checks use s[0:1] directly as the first operand and have no s_nop.
140define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double inreg %c, i32 inreg %d) {
141; GFX7-LABEL: s_div_fmas_f64:
142; GFX7:       ; %bb.0:
143; GFX7-NEXT:    s_cmp_eq_u32 s6, 0
144; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
145; GFX7-NEXT:    v_mov_b32_e32 v0, s0
146; GFX7-NEXT:    v_mov_b32_e32 v1, s1
147; GFX7-NEXT:    v_mov_b32_e32 v2, s2
148; GFX7-NEXT:    v_mov_b32_e32 v4, s4
149; GFX7-NEXT:    s_and_b32 s0, 1, s6
150; GFX7-NEXT:    v_mov_b32_e32 v3, s3
151; GFX7-NEXT:    v_mov_b32_e32 v5, s5
152; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
153; GFX7-NEXT:    s_nop 3
154; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
155; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
156; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
157; GFX7-NEXT:    ; return to shader part epilog
158;
159; GFX8-LABEL: s_div_fmas_f64:
160; GFX8:       ; %bb.0:
161; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
162; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
163; GFX8-NEXT:    v_mov_b32_e32 v0, s0
164; GFX8-NEXT:    v_mov_b32_e32 v1, s1
165; GFX8-NEXT:    v_mov_b32_e32 v2, s2
166; GFX8-NEXT:    v_mov_b32_e32 v4, s4
167; GFX8-NEXT:    s_and_b32 s0, 1, s6
168; GFX8-NEXT:    v_mov_b32_e32 v3, s3
169; GFX8-NEXT:    v_mov_b32_e32 v5, s5
170; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
171; GFX8-NEXT:    s_nop 3
172; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
173; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
174; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
175; GFX8-NEXT:    ; return to shader part epilog
176;
177; GFX10_W32-LABEL: s_div_fmas_f64:
178; GFX10_W32:       ; %bb.0:
179; GFX10_W32-NEXT:    s_cmp_eq_u32 s6, 0
180; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
181; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s4
182; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
183; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s5
184; GFX10_W32-NEXT:    s_cselect_b32 s6, 1, 0
185; GFX10_W32-NEXT:    s_and_b32 s6, 1, s6
186; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
187; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
188; GFX10_W32-NEXT:    v_readfirstlane_b32 s0, v0
189; GFX10_W32-NEXT:    v_readfirstlane_b32 s1, v1
190; GFX10_W32-NEXT:    ; return to shader part epilog
191;
192; GFX10_W64-LABEL: s_div_fmas_f64:
193; GFX10_W64:       ; %bb.0:
194; GFX10_W64-NEXT:    s_cmp_eq_u32 s6, 0
195; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s2
196; GFX10_W64-NEXT:    s_cselect_b32 s6, 1, 0
197; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s4
198; GFX10_W64-NEXT:    s_and_b32 s6, 1, s6
199; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s3
200; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s6
201; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s5
202; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
203; GFX10_W64-NEXT:    v_readfirstlane_b32 s0, v0
204; GFX10_W64-NEXT:    v_readfirstlane_b32 s1, v1
205; GFX10_W64-NEXT:    ; return to shader part epilog
206  %vcc = icmp eq i32 %d, 0
207  %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %vcc)
208  ret double %result
209}
210
; Kernel variant: %a, %b, %c and the i1 %d are kernel arguments, each preceded
; by an unused [8 x i32] argument, and the result of
; llvm.amdgcn.div.fmas.f32 is stored to %out.
; The checks expect each operand to be fetched with its own s_load_dword, the
; loaded %d to be masked with s_and_b32 and moved into vcc with
; v_cmp_ne_u32_e64. Note the GFX7 checks expect "s_nop 2" (not 3) here, with
; an s_mov_b32 placed between the compare and the s_nop; GFX8 expects
; "s_nop 3" and the GFX10 checks have no s_nop.
; Expected store instruction differs per target: buffer_store_dword (GFX7),
; flat_store_dword (GFX8), global_store_dword (GFX10).
211define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
212; GFX7-LABEL: test_div_fmas_f32:
213; GFX7:       ; %bb.0:
214; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
215; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
216; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
217; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x25
218; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x2e
219; GFX7-NEXT:    s_mov_b32 s7, 0xf000
220; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
221; GFX7-NEXT:    v_mov_b32_e32 v0, s2
222; GFX7-NEXT:    v_mov_b32_e32 v1, s3
223; GFX7-NEXT:    v_mov_b32_e32 v2, s6
224; GFX7-NEXT:    s_and_b32 s0, 1, s0
225; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
226; GFX7-NEXT:    s_mov_b32 s6, -1
227; GFX7-NEXT:    s_nop 2
228; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
229; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
230; GFX7-NEXT:    s_endpgm
231;
232; GFX8-LABEL: test_div_fmas_f32:
233; GFX8:       ; %bb.0:
234; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
235; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
236; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x94
237; GFX8-NEXT:    s_load_dword s5, s[0:1], 0xb8
238; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
239; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
240; GFX8-NEXT:    v_mov_b32_e32 v0, s2
241; GFX8-NEXT:    v_mov_b32_e32 v1, s3
242; GFX8-NEXT:    v_mov_b32_e32 v2, s4
243; GFX8-NEXT:    s_and_b32 s2, 1, s5
244; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
245; GFX8-NEXT:    s_nop 3
246; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
247; GFX8-NEXT:    v_mov_b32_e32 v0, s0
248; GFX8-NEXT:    v_mov_b32_e32 v1, s1
249; GFX8-NEXT:    flat_store_dword v[0:1], v2
250; GFX8-NEXT:    s_endpgm
251;
252; GFX10_W32-LABEL: test_div_fmas_f32:
253; GFX10_W32:       ; %bb.0:
254; GFX10_W32-NEXT:    s_clause 0x4
255; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0xb8
256; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x70
257; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x94
258; GFX10_W32-NEXT:    s_load_dword s7, s[0:1], 0x4c
259; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
260; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
262; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
263; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
264; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
265; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
266; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
267; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
268; GFX10_W32-NEXT:    s_endpgm
269;
270; GFX10_W64-LABEL: test_div_fmas_f32:
271; GFX10_W64:       ; %bb.0:
272; GFX10_W64-NEXT:    s_clause 0x4
273; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0xb8
274; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x70
275; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x94
276; GFX10_W64-NEXT:    s_load_dword s7, s[0:1], 0x4c
277; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
278; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
279; GFX10_W64-NEXT:    s_and_b32 s0, 1, s4
280; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
281; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
282; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
283; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s7, v0, v1
284; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
285; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
286; GFX10_W64-NEXT:    s_endpgm
287  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
288  store float %result, float addrspace(1)* %out, align 4
289  ret void
290}
291
; Kernel variant with the constant 1.0 as the FIRST operand of
; llvm.amdgcn.div.fmas.f32; %b, %c and %d come from kernel arguments and the
; result is stored to %out. %a is declared but not used by the body.
; The checks expect 1.0 to appear literally in the first source position of
; v_div_fmas_f32 and expect no load for %a (only three s_load_dword plus the
; pointer load). GFX7 and GFX8 expect "s_nop 3" after the v_cmp that writes
; vcc; the GFX10 checks have no s_nop.
292define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
293; GFX7-LABEL: test_div_fmas_f32_inline_imm_0:
294; GFX7:       ; %bb.0:
295; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
296; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x1c
297; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x25
298; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x2e
299; GFX7-NEXT:    s_mov_b32 s6, -1
300; GFX7-NEXT:    s_mov_b32 s7, 0xf000
301; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX7-NEXT:    v_mov_b32_e32 v0, s2
303; GFX7-NEXT:    v_mov_b32_e32 v1, s3
304; GFX7-NEXT:    s_and_b32 s0, 1, s0
305; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
306; GFX7-NEXT:    s_nop 3
307; GFX7-NEXT:    v_div_fmas_f32 v0, 1.0, v0, v1
308; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
309; GFX7-NEXT:    s_endpgm
310;
311; GFX8-LABEL: test_div_fmas_f32_inline_imm_0:
312; GFX8:       ; %bb.0:
313; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x70
314; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x94
315; GFX8-NEXT:    s_load_dword s4, s[0:1], 0xb8
316; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
317; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX8-NEXT:    v_mov_b32_e32 v0, s2
319; GFX8-NEXT:    v_mov_b32_e32 v1, s3
320; GFX8-NEXT:    s_and_b32 s2, 1, s4
321; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
322; GFX8-NEXT:    s_nop 3
323; GFX8-NEXT:    v_div_fmas_f32 v2, 1.0, v0, v1
324; GFX8-NEXT:    v_mov_b32_e32 v0, s0
325; GFX8-NEXT:    v_mov_b32_e32 v1, s1
326; GFX8-NEXT:    flat_store_dword v[0:1], v2
327; GFX8-NEXT:    s_endpgm
328;
329; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0:
330; GFX10_W32:       ; %bb.0:
331; GFX10_W32-NEXT:    s_clause 0x3
332; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0xb8
333; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x94
334; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x70
335; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
336; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
337; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
339; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
340; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
341; GFX10_W32-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
342; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
343; GFX10_W32-NEXT:    s_endpgm
344;
345; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0:
346; GFX10_W64:       ; %bb.0:
347; GFX10_W64-NEXT:    s_clause 0x3
348; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0xb8
349; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x94
350; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x70
351; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
352; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
353; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
354; GFX10_W64-NEXT:    s_and_b32 s0, 1, s4
355; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
356; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
357; GFX10_W64-NEXT:    v_div_fmas_f32 v0, 1.0, s6, v0
358; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
359; GFX10_W64-NEXT:    s_endpgm
360  %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d)
361  store float %result, float addrspace(1)* %out, align 4
362  ret void
363}
364
; Kernel variant with the constant 1.0 as the SECOND operand of
; llvm.amdgcn.div.fmas.f32; %a, %c and %d come from kernel arguments and the
; result is stored to %out. %b is declared but not used by the body.
; Unlike the sibling inline-immediate tests, %a, %b and %c are adjacent kernel
; arguments here (only %d is preceded by an [8 x i32] argument), which is why
; the expected load offsets differ from those tests.
; The checks expect 1.0 literally in the second source position of
; v_div_fmas_f32. GFX7 and GFX8 expect "s_nop 3" after the v_cmp that writes
; vcc; the GFX10 checks have no s_nop.
365define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) {
366; GFX7-LABEL: test_div_fmas_f32_inline_imm_1:
367; GFX7:       ; %bb.0:
368; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
369; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
370; GFX7-NEXT:    s_load_dword s3, s[0:1], 0xd
371; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x16
372; GFX7-NEXT:    s_mov_b32 s6, -1
373; GFX7-NEXT:    s_mov_b32 s7, 0xf000
374; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX7-NEXT:    v_mov_b32_e32 v0, s2
376; GFX7-NEXT:    v_mov_b32_e32 v1, s3
377; GFX7-NEXT:    s_and_b32 s0, 1, s0
378; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
379; GFX7-NEXT:    s_nop 3
380; GFX7-NEXT:    v_div_fmas_f32 v0, v0, 1.0, v1
381; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
382; GFX7-NEXT:    s_endpgm
383;
384; GFX8-LABEL: test_div_fmas_f32_inline_imm_1:
385; GFX8:       ; %bb.0:
386; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x2c
387; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x34
388; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x58
389; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
390; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
391; GFX8-NEXT:    v_mov_b32_e32 v0, s2
392; GFX8-NEXT:    v_mov_b32_e32 v1, s3
393; GFX8-NEXT:    s_and_b32 s2, 1, s4
394; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
395; GFX8-NEXT:    s_nop 3
396; GFX8-NEXT:    v_div_fmas_f32 v2, v0, 1.0, v1
397; GFX8-NEXT:    v_mov_b32_e32 v0, s0
398; GFX8-NEXT:    v_mov_b32_e32 v1, s1
399; GFX8-NEXT:    flat_store_dword v[0:1], v2
400; GFX8-NEXT:    s_endpgm
401;
402; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1:
403; GFX10_W32:       ; %bb.0:
404; GFX10_W32-NEXT:    s_clause 0x3
405; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x58
406; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x34
407; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x2c
408; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
409; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
410; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
411; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
412; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
413; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
414; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
415; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
416; GFX10_W32-NEXT:    s_endpgm
417;
418; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
419; GFX10_W64:       ; %bb.0:
420; GFX10_W64-NEXT:    s_clause 0x3
421; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x58
422; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x34
423; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x2c
424; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
425; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
426; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
427; GFX10_W64-NEXT:    s_and_b32 s0, 1, s4
428; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
429; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
430; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, 1.0, v0
431; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
432; GFX10_W64-NEXT:    s_endpgm
433  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d)
434  store float %result, float addrspace(1)* %out, align 4
435  ret void
436}
437
; Kernel variant with the constant 1.0 as the THIRD operand of
; llvm.amdgcn.div.fmas.f32; %a, %b and %d come from kernel arguments and the
; result is stored to %out. %c is declared but not used by the body.
; The checks expect 1.0 literally in the third source position of
; v_div_fmas_f32 and expect no load for %c. GFX7 and GFX8 expect "s_nop 3"
; after the v_cmp that writes vcc; the GFX10 checks have no s_nop.
438define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
439; GFX7-LABEL: test_div_fmas_f32_inline_imm_2:
440; GFX7:       ; %bb.0:
441; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
442; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
443; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
444; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x2e
445; GFX7-NEXT:    s_mov_b32 s6, -1
446; GFX7-NEXT:    s_mov_b32 s7, 0xf000
447; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
448; GFX7-NEXT:    v_mov_b32_e32 v0, s2
449; GFX7-NEXT:    v_mov_b32_e32 v1, s3
450; GFX7-NEXT:    s_and_b32 s0, 1, s0
451; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
452; GFX7-NEXT:    s_nop 3
453; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, 1.0
454; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
455; GFX7-NEXT:    s_endpgm
456;
457; GFX8-LABEL: test_div_fmas_f32_inline_imm_2:
458; GFX8:       ; %bb.0:
459; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
460; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
461; GFX8-NEXT:    s_load_dword s4, s[0:1], 0xb8
462; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
463; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
464; GFX8-NEXT:    v_mov_b32_e32 v0, s2
465; GFX8-NEXT:    v_mov_b32_e32 v1, s3
466; GFX8-NEXT:    s_and_b32 s2, 1, s4
467; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
468; GFX8-NEXT:    s_nop 3
469; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, 1.0
470; GFX8-NEXT:    v_mov_b32_e32 v0, s0
471; GFX8-NEXT:    v_mov_b32_e32 v1, s1
472; GFX8-NEXT:    flat_store_dword v[0:1], v2
473; GFX8-NEXT:    s_endpgm
474;
475; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2:
476; GFX10_W32:       ; %bb.0:
477; GFX10_W32-NEXT:    s_clause 0x3
478; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0xb8
479; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x70
480; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x4c
481; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
482; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
483; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
485; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
486; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
487; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
488; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
489; GFX10_W32-NEXT:    s_endpgm
490;
491; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2:
492; GFX10_W64:       ; %bb.0:
493; GFX10_W64-NEXT:    s_clause 0x3
494; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0xb8
495; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x70
496; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x4c
497; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
498; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
499; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
500; GFX10_W64-NEXT:    s_and_b32 s0, 1, s4
501; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
502; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
503; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, 1.0
504; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
505; GFX10_W64-NEXT:    s_endpgm
506  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d)
507  store float %result, float addrspace(1)* %out, align 4
508  ret void
509}
510
; Kernel variant for double precision: %a, %b, %c and the i1 %d are adjacent
; kernel arguments (no padding arguments between them) and the result of
; llvm.amdgcn.div.fmas.f64 is stored to %out with 8-byte alignment.
; The checks expect the pointer and the three doubles to be fetched with a
; single s_load_dwordx8 and %d with a separate s_load_dword, then %d masked
; with s_and_b32 and moved into vcc with v_cmp_ne_u32_e64.
; GFX7 and GFX8 expect "s_nop 3" before v_div_fmas_f64 and a
; flat_store_dwordx2; the GFX10 checks have no s_nop and expect
; global_store_dwordx2.
511define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) {
512; GFX7-LABEL: test_div_fmas_f64:
513; GFX7:       ; %bb.0:
514; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x11
515; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
516; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
517; GFX7-NEXT:    v_mov_b32_e32 v0, s2
518; GFX7-NEXT:    v_mov_b32_e32 v1, s3
519; GFX7-NEXT:    v_mov_b32_e32 v2, s4
520; GFX7-NEXT:    v_mov_b32_e32 v4, s6
521; GFX7-NEXT:    s_and_b32 s2, 1, s8
522; GFX7-NEXT:    v_mov_b32_e32 v3, s5
523; GFX7-NEXT:    v_mov_b32_e32 v5, s7
524; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
525; GFX7-NEXT:    s_nop 3
526; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
527; GFX7-NEXT:    v_mov_b32_e32 v3, s1
528; GFX7-NEXT:    v_mov_b32_e32 v2, s0
529; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
530; GFX7-NEXT:    s_endpgm
531;
532; GFX8-LABEL: test_div_fmas_f64:
533; GFX8:       ; %bb.0:
534; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
535; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
536; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
537; GFX8-NEXT:    v_mov_b32_e32 v0, s2
538; GFX8-NEXT:    v_mov_b32_e32 v1, s3
539; GFX8-NEXT:    v_mov_b32_e32 v2, s4
540; GFX8-NEXT:    v_mov_b32_e32 v4, s6
541; GFX8-NEXT:    s_and_b32 s2, 1, s8
542; GFX8-NEXT:    v_mov_b32_e32 v3, s5
543; GFX8-NEXT:    v_mov_b32_e32 v5, s7
544; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
545; GFX8-NEXT:    s_nop 3
546; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
547; GFX8-NEXT:    v_mov_b32_e32 v3, s1
548; GFX8-NEXT:    v_mov_b32_e32 v2, s0
549; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
550; GFX8-NEXT:    s_endpgm
551;
552; GFX10_W32-LABEL: test_div_fmas_f64:
553; GFX10_W32:       ; %bb.0:
554; GFX10_W32-NEXT:    s_clause 0x1
555; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0x44
556; GFX10_W32-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
557; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
558; GFX10_W32-NEXT:    s_and_b32 s0, 1, s2
559; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s8
560; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s10
561; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s9
562; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s11
563; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
564; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3]
565; GFX10_W32-NEXT:    v_mov_b32_e32 v2, 0
566; GFX10_W32-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
567; GFX10_W32-NEXT:    s_endpgm
568;
569; GFX10_W64-LABEL: test_div_fmas_f64:
570; GFX10_W64:       ; %bb.0:
571; GFX10_W64-NEXT:    s_clause 0x1
572; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0x44
573; GFX10_W64-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x24
574; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
575; GFX10_W64-NEXT:    s_and_b32 s0, 1, s2
576; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s8
577; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s10
578; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
579; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s9
580; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s11
581; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3]
582; GFX10_W64-NEXT:    v_mov_b32_e32 v2, 0
583; GFX10_W64-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
584; GFX10_W64-NEXT:    s_endpgm
585  %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
586  store double %result, double addrspace(1)* %out, align 8
587  ret void
588}
589
; Kernel variant where the i1 condition is not a kernel argument but the
; result of an integer compare: (%i == 0) is passed to
; llvm.amdgcn.div.fmas.f32 and the result is stored to %out.
; The checks expect %a, %b, %c and %i to be fetched together with one
; s_load_dwordx4, the compare to be done with s_cmp_eq_u32 / s_cselect_b32 /
; s_and_b32, and the resulting bit to be moved into vcc with v_cmp_ne_u32_e64.
; GFX7 and GFX8 expect "s_nop 3" before v_div_fmas_f32; the GFX10 checks have
; no s_nop.
590define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) {
591; GFX7-LABEL: test_div_fmas_f32_cond_to_vcc:
592; GFX7:       ; %bb.0:
593; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
594; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
595; GFX7-NEXT:    s_mov_b32 s6, -1
596; GFX7-NEXT:    s_mov_b32 s7, 0xf000
597; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
598; GFX7-NEXT:    s_cmp_eq_u32 s3, 0
599; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
600; GFX7-NEXT:    v_mov_b32_e32 v0, s0
601; GFX7-NEXT:    s_and_b32 s0, 1, s3
602; GFX7-NEXT:    v_mov_b32_e32 v1, s1
603; GFX7-NEXT:    v_mov_b32_e32 v2, s2
604; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
605; GFX7-NEXT:    s_nop 3
606; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
607; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
608; GFX7-NEXT:    s_endpgm
609;
610; GFX8-LABEL: test_div_fmas_f32_cond_to_vcc:
611; GFX8:       ; %bb.0:
612; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
613; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
614; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
615; GFX8-NEXT:    s_cmp_eq_u32 s7, 0
616; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
617; GFX8-NEXT:    s_and_b32 s2, 1, s2
618; GFX8-NEXT:    v_mov_b32_e32 v0, s4
619; GFX8-NEXT:    v_mov_b32_e32 v1, s5
620; GFX8-NEXT:    v_mov_b32_e32 v2, s6
621; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
622; GFX8-NEXT:    s_nop 3
623; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
624; GFX8-NEXT:    v_mov_b32_e32 v0, s0
625; GFX8-NEXT:    v_mov_b32_e32 v1, s1
626; GFX8-NEXT:    flat_store_dword v[0:1], v2
627; GFX8-NEXT:    s_endpgm
628;
629; GFX10_W32-LABEL: test_div_fmas_f32_cond_to_vcc:
630; GFX10_W32:       ; %bb.0:
631; GFX10_W32-NEXT:    s_clause 0x1
632; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
633; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
634; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX10_W32-NEXT:    s_cmp_eq_u32 s7, 0
636; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
637; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
638; GFX10_W32-NEXT:    s_cselect_b32 s0, 1, 0
639; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
640; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
641; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
642; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
643; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
644; GFX10_W32-NEXT:    s_endpgm
645;
646; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc:
647; GFX10_W64:       ; %bb.0:
648; GFX10_W64-NEXT:    s_clause 0x1
649; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
650; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
651; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
652; GFX10_W64-NEXT:    s_cmp_eq_u32 s7, 0
653; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
654; GFX10_W64-NEXT:    s_cselect_b32 s0, 1, 0
655; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
656; GFX10_W64-NEXT:    s_and_b32 s0, 1, s0
657; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
658; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s4, v0, v1
659; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
660; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
661; GFX10_W64-NEXT:    s_endpgm
662  %cmp = icmp eq i32 %i, 0
663  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp)
664  store float %result, float addrspace(1)* %out, align 4
665  ret void
666}
667
; Kernel variant where the i1 condition is the constant false: %a, %b and %c
; come from kernel arguments (each preceded by an unused [8 x i32] argument)
; and the result of llvm.amdgcn.div.fmas.f32 is stored to %out.
; The checks expect vcc to be set directly from a scalar move of 0
; (s_mov_b64 vcc, 0; s_mov_b32 vcc_lo, 0 for GFX10 wave32) with no
; v_and / v_cmp sequence. In contrast to the tests that write vcc with a
; v_cmp, none of the targets' checks contain an s_nop before v_div_fmas_f32.
668define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
669; GFX7-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
670; GFX7:       ; %bb.0:
671; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
672; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
673; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
674; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x25
675; GFX7-NEXT:    s_mov_b64 vcc, 0
676; GFX7-NEXT:    s_mov_b32 s6, -1
677; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
678; GFX7-NEXT:    v_mov_b32_e32 v0, s2
679; GFX7-NEXT:    v_mov_b32_e32 v1, s3
680; GFX7-NEXT:    v_mov_b32_e32 v2, s0
681; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
682; GFX7-NEXT:    s_mov_b32 s7, 0xf000
683; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
684; GFX7-NEXT:    s_endpgm
685;
686; GFX8-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
687; GFX8:       ; %bb.0:
688; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
689; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
690; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x94
691; GFX8-NEXT:    s_mov_b64 vcc, 0
692; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
693; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
694; GFX8-NEXT:    v_mov_b32_e32 v0, s2
695; GFX8-NEXT:    v_mov_b32_e32 v1, s3
696; GFX8-NEXT:    v_mov_b32_e32 v2, s4
697; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
698; GFX8-NEXT:    v_mov_b32_e32 v0, s0
699; GFX8-NEXT:    v_mov_b32_e32 v1, s1
700; GFX8-NEXT:    flat_store_dword v[0:1], v2
701; GFX8-NEXT:    s_endpgm
702;
703; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
704; GFX10_W32:       ; %bb.0:
705; GFX10_W32-NEXT:    s_clause 0x3
706; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x70
707; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x94
708; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x4c
709; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
710; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, 0
711; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
712; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s4
713; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s5
714; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
715; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
716; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
717; GFX10_W32-NEXT:    s_endpgm
718;
719; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
720; GFX10_W64:       ; %bb.0:
721; GFX10_W64-NEXT:    s_clause 0x3
722; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x70
723; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x94
724; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x4c
725; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
726; GFX10_W64-NEXT:    s_mov_b64 vcc, 0
727; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
728; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s4
729; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s5
730; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
731; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
732; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
733; GFX10_W64-NEXT:    s_endpgm
734  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false)
735  store float %result, float addrspace(1)* %out, align 4
736  ret void
737}
738
; Kernel test: the condition operand of llvm.amdgcn.div.fmas.f32 is the
; constant `i1 true`. The expected code below materializes that constant with
; `s_mov_b64 vcc, -1` (`s_mov_b32 vcc_lo, -1` in the GFX10 wave32 run) ahead of
; v_div_fmas_f32, which takes no explicit condition operand in the output.
; The [8 x i32] parameters pad the kernel argument list, so %a, %b and %c are
; each fetched by a separate scalar load. The result is stored to %out.
739define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
740; GFX7-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
741; GFX7:       ; %bb.0:
742; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
743; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
744; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
745; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x25
746; GFX7-NEXT:    s_mov_b64 vcc, -1
747; GFX7-NEXT:    s_mov_b32 s6, -1
748; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
749; GFX7-NEXT:    v_mov_b32_e32 v0, s2
750; GFX7-NEXT:    v_mov_b32_e32 v1, s3
751; GFX7-NEXT:    v_mov_b32_e32 v2, s0
752; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
753; GFX7-NEXT:    s_mov_b32 s7, 0xf000
754; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
755; GFX7-NEXT:    s_endpgm
756;
757; GFX8-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
758; GFX8:       ; %bb.0:
759; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
760; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
761; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x94
762; GFX8-NEXT:    s_mov_b64 vcc, -1
763; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
764; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX8-NEXT:    v_mov_b32_e32 v0, s2
766; GFX8-NEXT:    v_mov_b32_e32 v1, s3
767; GFX8-NEXT:    v_mov_b32_e32 v2, s4
768; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
769; GFX8-NEXT:    v_mov_b32_e32 v0, s0
770; GFX8-NEXT:    v_mov_b32_e32 v1, s1
771; GFX8-NEXT:    flat_store_dword v[0:1], v2
772; GFX8-NEXT:    s_endpgm
773;
774; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
775; GFX10_W32:       ; %bb.0:
776; GFX10_W32-NEXT:    s_clause 0x3
777; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x70
778; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x94
779; GFX10_W32-NEXT:    s_load_dword s6, s[0:1], 0x4c
780; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
781; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, -1
782; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
783; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s4
784; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s5
785; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
786; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
787; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3]
788; GFX10_W32-NEXT:    s_endpgm
789;
790; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
791; GFX10_W64:       ; %bb.0:
792; GFX10_W64-NEXT:    s_clause 0x3
793; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x70
794; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x94
795; GFX10_W64-NEXT:    s_load_dword s6, s[0:1], 0x4c
796; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
797; GFX10_W64-NEXT:    s_mov_b64 vcc, -1
798; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
799; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s4
800; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s5
801; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s6, v0, v1
802; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
803; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3]
804; GFX10_W64-NEXT:    s_endpgm
805  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true)
806  store float %result, float addrspace(1)* %out, align 4
807  ret void
808}
809
; Kernel test: the condition is the logical AND of two compares, one on the
; workitem id (%tid == 0) and one on the kernel argument %d (%d != 0).
; In the expected code the two compare results are combined by s_and_b64
; (s_and_b32 in the GFX10 wave32 run) straight into vcc / vcc_lo before
; v_div_fmas_f32.
; %a, %b and %c are volatile loads from %in[tid], %in[tid + 1] and
; %in[tid + 2]; the result is stored to %out[2].
810define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, [8 x i32], i32 %d) {
811; GFX7-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
812; GFX7:       ; %bb.0:
813; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
814; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x15
815; GFX7-NEXT:    s_mov_b32 s2, 0
816; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
817; GFX7-NEXT:    v_mov_b32_e32 v2, 0
818; GFX7-NEXT:    s_mov_b32 s3, 0xf000
819; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
820; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
821; GFX7-NEXT:    buffer_load_dword v3, v[1:2], s[0:3], 0 addr64 glc
822; GFX7-NEXT:    s_waitcnt vmcnt(0)
823; GFX7-NEXT:    buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4 glc
824; GFX7-NEXT:    s_waitcnt vmcnt(0)
825; GFX7-NEXT:    buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8 glc
826; GFX7-NEXT:    s_waitcnt vmcnt(0)
827; GFX7-NEXT:    s_cmp_lg_u32 s8, 0
828; GFX7-NEXT:    s_cselect_b32 s0, 1, 0
829; GFX7-NEXT:    s_and_b32 s0, 1, s0
830; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
831; GFX7-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
832; GFX7-NEXT:    s_mov_b32 s2, -1
833; GFX7-NEXT:    s_and_b64 vcc, vcc, s[0:1]
834; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
835; GFX7-NEXT:    v_div_fmas_f32 v0, v3, v4, v1
836; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
837; GFX7-NEXT:    s_endpgm
838;
839; GFX8-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
840; GFX8:       ; %bb.0:
841; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
842; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x54
843; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
844; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
845; GFX8-NEXT:    v_mov_b32_e32 v1, s6
846; GFX8-NEXT:    v_mov_b32_e32 v2, s7
847; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
848; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
849; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v1
850; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
851; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 8, v1
852; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
853; GFX8-NEXT:    flat_load_dword v1, v[1:2] glc
854; GFX8-NEXT:    s_waitcnt vmcnt(0)
855; GFX8-NEXT:    flat_load_dword v2, v[3:4] glc
856; GFX8-NEXT:    s_waitcnt vmcnt(0)
857; GFX8-NEXT:    flat_load_dword v3, v[5:6] glc
858; GFX8-NEXT:    s_waitcnt vmcnt(0)
859; GFX8-NEXT:    s_add_u32 s0, s4, 8
860; GFX8-NEXT:    s_addc_u32 s1, s5, 0
861; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
862; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
863; GFX8-NEXT:    s_and_b32 s2, 1, s2
864; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
865; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
866; GFX8-NEXT:    s_and_b64 vcc, vcc, s[2:3]
867; GFX8-NEXT:    s_nop 1
868; GFX8-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
869; GFX8-NEXT:    v_mov_b32_e32 v0, s0
870; GFX8-NEXT:    v_mov_b32_e32 v1, s1
871; GFX8-NEXT:    flat_store_dword v[0:1], v2
872; GFX8-NEXT:    s_endpgm
873;
874; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
875; GFX10_W32:       ; %bb.0:
876; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
877; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
878; GFX10_W32-NEXT:    s_load_dword s0, s[0:1], 0x54
879; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
880; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
881; GFX10_W32-NEXT:    global_load_dword v2, v1, s[6:7] glc dlc
882; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
883; GFX10_W32-NEXT:    global_load_dword v3, v1, s[6:7] offset:4 glc dlc
884; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
885; GFX10_W32-NEXT:    global_load_dword v4, v1, s[6:7] offset:8 glc dlc
886; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
887; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
888; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
889; GFX10_W32-NEXT:    s_cselect_b32 s0, 1, 0
890; GFX10_W32-NEXT:    s_and_b32 s0, 1, s0
891; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
892; GFX10_W32-NEXT:    s_and_b32 vcc_lo, vcc_lo, s0
893; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v2, v3, v4
894; GFX10_W32-NEXT:    global_store_dword v1, v0, s[4:5] offset:8
895; GFX10_W32-NEXT:    s_endpgm
896;
897; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
898; GFX10_W64:       ; %bb.0:
899; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
900; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
901; GFX10_W64-NEXT:    s_load_dword s0, s[0:1], 0x54
902; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
903; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
904; GFX10_W64-NEXT:    global_load_dword v2, v1, s[6:7] glc dlc
905; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
906; GFX10_W64-NEXT:    global_load_dword v3, v1, s[6:7] offset:4 glc dlc
907; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
908; GFX10_W64-NEXT:    global_load_dword v4, v1, s[6:7] offset:8 glc dlc
909; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
910; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
911; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
912; GFX10_W64-NEXT:    s_cselect_b32 s0, 1, 0
913; GFX10_W64-NEXT:    s_and_b32 s0, 1, s0
914; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
915; GFX10_W64-NEXT:    s_and_b64 vcc, vcc, s[0:1]
916; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v2, v3, v4
917; GFX10_W64-NEXT:    global_store_dword v1, v0, s[4:5] offset:8
918; GFX10_W64-NEXT:    s_endpgm
919  %tid = call i32 @llvm.amdgcn.workitem.id.x()
920  %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
921  %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
922  %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
923  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
924
925  %a = load volatile float, float addrspace(1)* %gep.a
926  %b = load volatile float, float addrspace(1)* %gep.b
927  %c = load volatile float, float addrspace(1)* %gep.c
928
  ; Condition under test: true only for workitem 0, and only when %d is
  ; non-zero. %cmp0 depends on the workitem id, %cmp1 only on a kernel argument.
929  %cmp0 = icmp eq i32 %tid, 0
930  %cmp1 = icmp ne i32 %d, 0
931  %and = and i1 %cmp0, %cmp1
932
933  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and)
934  store float %result, float addrspace(1)* %gep.out, align 4
935  ret void
936}
937
; Kernel test: the i1 condition reaches llvm.amdgcn.div.fmas.f32 through a phi.
; Only the path through %bb (taken when %tid == 0) loads from %dummy and
; compares the value against zero; the edge straight from %entry supplies
; `false`.
; In the expected code the phi value is held as 0/1 in an SGPR (initialized
; with s_mov_b32 ..., 0 and overwritten by s_cselect_b32 ..., 1, 0 inside the
; branch). After the exec mask is restored it is masked with s_and_b32 and
; converted to vcc / vcc_lo by v_cmp_ne_u32_e64 ahead of v_div_fmas_f32.
; The result is stored to %out[2].
938define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, [8 x i32], float addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %dummy) {
939; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
940; GFX7:       ; %bb.0: ; %entry
941; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
942; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x13
943; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
944; GFX7-NEXT:    v_mov_b32_e32 v2, 0
945; GFX7-NEXT:    s_mov_b32 s10, 0
946; GFX7-NEXT:    s_mov_b32 s11, 0xf000
947; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
948; GFX7-NEXT:    buffer_load_dwordx3 v[1:3], v[1:2], s[8:11], 0 addr64
949; GFX7-NEXT:    s_mov_b32 s6, 0
950; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
951; GFX7-NEXT:    s_and_saveexec_b64 s[2:3], vcc
952; GFX7-NEXT:    s_cbranch_execz BB13_2
953; GFX7-NEXT:  ; %bb.1: ; %bb
954; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x1d
955; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
956; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
957; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
958; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
959; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
960; GFX7-NEXT:  BB13_2: ; %exit
961; GFX7-NEXT:    s_or_b64 exec, exec, s[2:3]
962; GFX7-NEXT:    s_and_b32 s0, 1, s6
963; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
964; GFX7-NEXT:    s_mov_b32 s10, -1
965; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
966; GFX7-NEXT:    s_waitcnt vmcnt(0)
967; GFX7-NEXT:    s_nop 0
968; GFX7-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
969; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
970; GFX7-NEXT:    s_endpgm
971;
972; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
973; GFX8:       ; %bb.0: ; %entry
974; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
975; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
976; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
977; GFX8-NEXT:    s_mov_b32 s6, 0
978; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
979; GFX8-NEXT:    v_mov_b32_e32 v1, s4
980; GFX8-NEXT:    v_mov_b32_e32 v2, s5
981; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
982; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
983; GFX8-NEXT:    flat_load_dwordx3 v[1:3], v[1:2]
984; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
985; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
986; GFX8-NEXT:    s_cbranch_execz BB13_2
987; GFX8-NEXT:  ; %bb.1: ; %bb
988; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x74
989; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
990; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
991; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
992; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
993; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
994; GFX8-NEXT:  BB13_2: ; %exit
995; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
996; GFX8-NEXT:    s_add_u32 s0, s2, 8
997; GFX8-NEXT:    s_addc_u32 s1, s3, 0
998; GFX8-NEXT:    s_and_b32 s2, 1, s6
999; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
1000; GFX8-NEXT:    s_waitcnt vmcnt(0)
1001; GFX8-NEXT:    s_nop 2
1002; GFX8-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
1003; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1004; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1005; GFX8-NEXT:    flat_store_dword v[0:1], v2
1006; GFX8-NEXT:    s_endpgm
1007;
1008; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
1009; GFX10_W32:       ; %bb.0: ; %entry
1010; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
1011; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1012; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1013; GFX10_W32-NEXT:    s_mov_b32 s5, 0
1014; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
1015; GFX10_W32-NEXT:    global_load_dwordx3 v[1:3], v1, s[2:3]
1016; GFX10_W32-NEXT:    s_waitcnt_depctr 0xffe3
1017; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1018; GFX10_W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1019; GFX10_W32-NEXT:    s_cbranch_execz BB13_2
1020; GFX10_W32-NEXT:  ; %bb.1: ; %bb
1021; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x74
1022; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
1023; GFX10_W32-NEXT:    s_load_dword s0, s[0:1], 0x0
1024; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
1026; GFX10_W32-NEXT:    s_cselect_b32 s5, 1, 0
1027; GFX10_W32-NEXT:  BB13_2: ; %exit
1028; GFX10_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1029; GFX10_W32-NEXT:    s_and_b32 s0, 1, s5
1030; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
1031; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
1032; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
1033; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
1034; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
1035; GFX10_W32-NEXT:    global_store_dword v1, v0, s[2:3] offset:8
1036; GFX10_W32-NEXT:    s_endpgm
1037;
1038; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
1039; GFX10_W64:       ; %bb.0: ; %entry
1040; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
1041; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1042; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1043; GFX10_W64-NEXT:    s_mov_b32 s6, 0
1044; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX10_W64-NEXT:    global_load_dwordx3 v[1:3], v1, s[2:3]
1046; GFX10_W64-NEXT:    s_waitcnt_depctr 0xffe3
1047; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1048; GFX10_W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1049; GFX10_W64-NEXT:    s_cbranch_execz BB13_2
1050; GFX10_W64-NEXT:  ; %bb.1: ; %bb
1051; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x74
1052; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX10_W64-NEXT:    s_load_dword s0, s[0:1], 0x0
1054; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
1055; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
1056; GFX10_W64-NEXT:    s_cselect_b32 s6, 1, 0
1057; GFX10_W64-NEXT:  BB13_2: ; %exit
1058; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1059; GFX10_W64-NEXT:    s_and_b32 s0, 1, s6
1060; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
1061; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
1062; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
1063; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
1064; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
1065; GFX10_W64-NEXT:    global_store_dword v1, v0, s[2:3] offset:8
1066; GFX10_W64-NEXT:    s_endpgm
1067entry:
1068  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1069  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
1070  %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
1071  %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
1072  %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
1073
1074  %a = load float, float addrspace(1)* %gep.a
1075  %b = load float, float addrspace(1)* %gep.b
1076  %c = load float, float addrspace(1)* %gep.c
1077
  ; Branch on the workitem id: only workitem 0 takes the %bb path.
1078  %cmp0 = icmp eq i32 %tid, 0
1079  br i1 %cmp0, label %bb, label %exit
1080
1081bb:
1082  %val = load i32, i32 addrspace(1)* %dummy
1083  %cmp1 = icmp ne i32 %val, 0
1084  br label %exit
1085
1086exit:
  ; %cond is `false` on the edge from %entry and %cmp1 on the edge from %bb.
1087  %cond = phi i1 [false, %entry], [%cmp1, %bb]
1088  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond)
1089  store float %result, float addrspace(1)* %gep.out, align 4
1090  ret void
1091}
1092
; AMDGPU intrinsic declarations: the workitem id query and the f32 / f64
; div.fmas operations (three floating-point operands plus an i1 condition).
1093declare i32 @llvm.amdgcn.workitem.id.x() #0
1094declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) #0
1095declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) #0
1096
; Attribute group #0, applied to every declaration above.
1097attributes #0 = { nounwind readnone speculatable }
1098