1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10_W32 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10_W64 %s
6
; llvm.amdgcn.div.fmas.f32 in a default-calling-convention function. In the
; checks below all four arguments arrive in VGPRs (v0-v3). Every target masks
; the i1 condition to bit 0 (v_and_b32 with 1) and compares it against 0 to
; produce vcc (vcc_lo in the wave32 run) right before v_div_fmas_f32.
; GFX7/GFX8 expect an "s_nop 3" between the compare and v_div_fmas_f32; the
; GFX10 checks expect none.
7define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
8; GFX7-LABEL: v_div_fmas_f32:
9; GFX7:       ; %bb.0:
10; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
12; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
13; GFX7-NEXT:    s_nop 3
14; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
15; GFX7-NEXT:    s_setpc_b64 s[30:31]
16;
17; GFX8-LABEL: v_div_fmas_f32:
18; GFX8:       ; %bb.0:
19; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
21; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
22; GFX8-NEXT:    s_nop 3
23; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
24; GFX8-NEXT:    s_setpc_b64 s[30:31]
25;
26; GFX10_W32-LABEL: v_div_fmas_f32:
27; GFX10_W32:       ; %bb.0:
28; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
29; GFX10_W32-NEXT:    s_waitcnt_vscnt null, 0x0
30; GFX10_W32-NEXT:    v_and_b32_e32 v3, 1, v3
31; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
32; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
33; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
34; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
35;
36; GFX10_W64-LABEL: v_div_fmas_f32:
37; GFX10_W64:       ; %bb.0:
38; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
39; GFX10_W64-NEXT:    s_waitcnt_vscnt null, 0x0
40; GFX10_W64-NEXT:    v_and_b32_e32 v3, 1, v3
41; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
42; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
43; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
44  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
45  ret float %result
46}
47
; f64 counterpart of the VGPR-argument test: the three double operands occupy
; the register pairs v[0:1], v[2:3] and v[4:5] in the checks, and the i1
; condition is in v6. The condition is masked to bit 0 and compared against 0
; into vcc (vcc_lo in the wave32 run) before v_div_fmas_f64. GFX7/GFX8 expect
; "s_nop 3" between the compare and v_div_fmas_f64; GFX10 expects no nop.
48define double @v_div_fmas_f64(double %a, double %b, double %c, i1 %d) {
49; GFX7-LABEL: v_div_fmas_f64:
50; GFX7:       ; %bb.0:
51; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52; GFX7-NEXT:    v_and_b32_e32 v6, 1, v6
53; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
54; GFX7-NEXT:    s_nop 3
55; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
56; GFX7-NEXT:    s_setpc_b64 s[30:31]
57;
58; GFX8-LABEL: v_div_fmas_f64:
59; GFX8:       ; %bb.0:
60; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
62; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
63; GFX8-NEXT:    s_nop 3
64; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
65; GFX8-NEXT:    s_setpc_b64 s[30:31]
66;
67; GFX10_W32-LABEL: v_div_fmas_f64:
68; GFX10_W32:       ; %bb.0:
69; GFX10_W32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70; GFX10_W32-NEXT:    s_waitcnt_vscnt null, 0x0
71; GFX10_W32-NEXT:    v_and_b32_e32 v6, 1, v6
72; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
73; GFX10_W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
74; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
75; GFX10_W32-NEXT:    s_setpc_b64 s[30:31]
76;
77; GFX10_W64-LABEL: v_div_fmas_f64:
78; GFX10_W64:       ; %bb.0:
79; GFX10_W64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; GFX10_W64-NEXT:    s_waitcnt_vscnt null, 0x0
81; GFX10_W64-NEXT:    v_and_b32_e32 v6, 1, v6
82; GFX10_W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
83; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
84; GFX10_W64-NEXT:    s_setpc_b64 s[30:31]
85  %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
86  ret double %result
87}
88
; amdgpu_ps variant with every argument marked inreg; the checks read them
; from s0-s3. The condition is "icmp eq i32 %d, 0": it is expected to be
; materialised with s_cmp_eq_u32 + s_cselect_b32, masked with s_and_b32, and
; then turned into vcc (vcc_lo for wave32) by v_cmp_ne_u32_e64.
; GFX7/GFX8 copy all three float operands into VGPRs and expect "s_nop 3"
; before v_div_fmas_f32. GFX10 copies only %b and %c and passes %a directly
; as the SGPR operand s0, with no nop.
89define amdgpu_ps float @s_div_fmas_f32(float inreg %a, float inreg %b, float inreg %c, i32 inreg %d) {
90; GFX7-LABEL: s_div_fmas_f32:
91; GFX7:       ; %bb.0:
92; GFX7-NEXT:    s_cmp_eq_u32 s3, 0
93; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
94; GFX7-NEXT:    v_mov_b32_e32 v0, s0
95; GFX7-NEXT:    s_and_b32 s0, 1, s3
96; GFX7-NEXT:    v_mov_b32_e32 v1, s1
97; GFX7-NEXT:    v_mov_b32_e32 v2, s2
98; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
99; GFX7-NEXT:    s_nop 3
100; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
101; GFX7-NEXT:    ; return to shader part epilog
102;
103; GFX8-LABEL: s_div_fmas_f32:
104; GFX8:       ; %bb.0:
105; GFX8-NEXT:    s_cmp_eq_u32 s3, 0
106; GFX8-NEXT:    s_cselect_b32 s3, 1, 0
107; GFX8-NEXT:    v_mov_b32_e32 v0, s0
108; GFX8-NEXT:    s_and_b32 s0, 1, s3
109; GFX8-NEXT:    v_mov_b32_e32 v1, s1
110; GFX8-NEXT:    v_mov_b32_e32 v2, s2
111; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
112; GFX8-NEXT:    s_nop 3
113; GFX8-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
114; GFX8-NEXT:    ; return to shader part epilog
115;
116; GFX10_W32-LABEL: s_div_fmas_f32:
117; GFX10_W32:       ; %bb.0:
118; GFX10_W32-NEXT:    s_cmp_eq_u32 s3, 0
119; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s1
120; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s2
121; GFX10_W32-NEXT:    s_cselect_b32 s3, 1, 0
122; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
123; GFX10_W32-NEXT:    s_and_b32 s3, 1, s3
124; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s3
125; GFX10_W32-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
126; GFX10_W32-NEXT:    ; return to shader part epilog
127;
128; GFX10_W64-LABEL: s_div_fmas_f32:
129; GFX10_W64:       ; %bb.0:
130; GFX10_W64-NEXT:    s_cmp_eq_u32 s3, 0
131; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s1
132; GFX10_W64-NEXT:    s_cselect_b32 s3, 1, 0
133; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s2
134; GFX10_W64-NEXT:    s_and_b32 s3, 1, s3
135; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s3
136; GFX10_W64-NEXT:    v_div_fmas_f32 v0, s0, v0, v1
137; GFX10_W64-NEXT:    ; return to shader part epilog
138  %vcc = icmp eq i32 %d, 0
139  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %vcc)
140  ret float %result
141}
142
; f64 amdgpu_ps variant with inreg arguments: the doubles are read from
; s[0:1], s[2:3] and s[4:5], and %d from s6. The condition "icmp eq i32 %d, 0"
; goes through s_cmp_eq_u32 + s_cselect_b32 + s_and_b32 and is then written to
; vcc (vcc_lo for wave32) by v_cmp_ne_u32_e64.
; GFX7/GFX8 copy all three operands into VGPR pairs and expect "s_nop 3";
; GFX10 keeps %a as the SGPR pair s[0:1] and expects no nop. On every target
; the checks end with two v_readfirstlane_b32 instructions that move the
; result from v0/v1 into s0/s1 before the shader returns.
143define amdgpu_ps double @s_div_fmas_f64(double inreg %a, double inreg %b, double inreg %c, i32 inreg %d) {
144; GFX7-LABEL: s_div_fmas_f64:
145; GFX7:       ; %bb.0:
146; GFX7-NEXT:    s_cmp_eq_u32 s6, 0
147; GFX7-NEXT:    v_mov_b32_e32 v0, s0
148; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
149; GFX7-NEXT:    v_mov_b32_e32 v1, s1
150; GFX7-NEXT:    v_mov_b32_e32 v2, s2
151; GFX7-NEXT:    v_mov_b32_e32 v4, s4
152; GFX7-NEXT:    s_and_b32 s0, 1, s6
153; GFX7-NEXT:    v_mov_b32_e32 v3, s3
154; GFX7-NEXT:    v_mov_b32_e32 v5, s5
155; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
156; GFX7-NEXT:    s_nop 3
157; GFX7-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
158; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
159; GFX7-NEXT:    v_readfirstlane_b32 s1, v1
160; GFX7-NEXT:    ; return to shader part epilog
161;
162; GFX8-LABEL: s_div_fmas_f64:
163; GFX8:       ; %bb.0:
164; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
165; GFX8-NEXT:    v_mov_b32_e32 v0, s0
166; GFX8-NEXT:    s_cselect_b32 s6, 1, 0
167; GFX8-NEXT:    v_mov_b32_e32 v1, s1
168; GFX8-NEXT:    v_mov_b32_e32 v2, s2
169; GFX8-NEXT:    v_mov_b32_e32 v4, s4
170; GFX8-NEXT:    s_and_b32 s0, 1, s6
171; GFX8-NEXT:    v_mov_b32_e32 v3, s3
172; GFX8-NEXT:    v_mov_b32_e32 v5, s5
173; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
174; GFX8-NEXT:    s_nop 3
175; GFX8-NEXT:    v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
176; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
177; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
178; GFX8-NEXT:    ; return to shader part epilog
179;
180; GFX10_W32-LABEL: s_div_fmas_f64:
181; GFX10_W32:       ; %bb.0:
182; GFX10_W32-NEXT:    s_cmp_eq_u32 s6, 0
183; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
184; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s4
185; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
186; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s5
187; GFX10_W32-NEXT:    s_cselect_b32 s6, 1, 0
188; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
189; GFX10_W32-NEXT:    s_and_b32 s6, 1, s6
190; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s6
191; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
192; GFX10_W32-NEXT:    v_readfirstlane_b32 s0, v0
193; GFX10_W32-NEXT:    v_readfirstlane_b32 s1, v1
194; GFX10_W32-NEXT:    ; return to shader part epilog
195;
196; GFX10_W64-LABEL: s_div_fmas_f64:
197; GFX10_W64:       ; %bb.0:
198; GFX10_W64-NEXT:    s_cmp_eq_u32 s6, 0
199; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s2
200; GFX10_W64-NEXT:    s_cselect_b32 s6, 1, 0
201; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s4
202; GFX10_W64-NEXT:    s_and_b32 s6, 1, s6
203; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s3
204; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s6
205; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s5
206; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3]
207; GFX10_W64-NEXT:    v_readfirstlane_b32 s0, v0
208; GFX10_W64-NEXT:    v_readfirstlane_b32 s1, v1
209; GFX10_W64-NEXT:    ; return to shader part epilog
210  %vcc = icmp eq i32 %d, 0
211  %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %vcc)
212  ret double %result
213}
214
; Kernel variant: %a, %b, %c and %d are kernel arguments separated by
; [8 x i32] padding, so the checks expect a separate s_load_dword for each
; one plus an s_load_dwordx2 for %out. The loaded i1 is masked with s_and_b32
; and compared with v_cmp_ne_u32_e64 to set vcc (vcc_lo for wave32).
; The result is stored to %out with buffer_store_dword on GFX7,
; flat_store_dword on GFX8 and global_store_dword on GFX10.
; On GFX7 an s_mov_b32 is scheduled between the compare and v_div_fmas_f32 and
; the expected nop is "s_nop 2"; GFX8 expects "s_nop 3"; GFX10 expects none.
215define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
216; GFX7-LABEL: test_div_fmas_f32:
217; GFX7:       ; %bb.0:
218; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
219; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
220; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
221; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x25
222; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x2e
223; GFX7-NEXT:    s_mov_b32 s7, 0xf000
224; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX7-NEXT:    v_mov_b32_e32 v0, s2
226; GFX7-NEXT:    v_mov_b32_e32 v1, s3
227; GFX7-NEXT:    v_mov_b32_e32 v2, s6
228; GFX7-NEXT:    s_and_b32 s0, 1, s0
229; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
230; GFX7-NEXT:    s_mov_b32 s6, -1
231; GFX7-NEXT:    s_nop 2
232; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
233; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
234; GFX7-NEXT:    s_endpgm
235;
236; GFX8-LABEL: test_div_fmas_f32:
237; GFX8:       ; %bb.0:
238; GFX8-NEXT:	s_load_dword s2, s[0:1], 0xb8
239; GFX8-NEXT:	s_load_dword s3, s[0:1], 0x4c
240; GFX8-NEXT:	s_load_dword s4, s[0:1], 0x70
241; GFX8-NEXT:	s_load_dword s5, s[0:1], 0x94
242; GFX8-NEXT:	s_load_dwordx2 s[0:1], s[0:1], 0x24
243; GFX8-NEXT:	s_waitcnt lgkmcnt(0)
244; GFX8-NEXT:	s_and_b32 s2, 1, s2
245; GFX8-NEXT:	v_mov_b32_e32 v0, s3
246; GFX8-NEXT:	v_mov_b32_e32 v1, s4
247; GFX8-NEXT:	v_mov_b32_e32 v2, s5
248; GFX8-NEXT:	v_cmp_ne_u32_e64 vcc, 0, s2
249; GFX8-NEXT:    s_nop 3
250; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
251; GFX8-NEXT:    v_mov_b32_e32 v0, s0
252; GFX8-NEXT:    v_mov_b32_e32 v1, s1
253; GFX8-NEXT:    flat_store_dword v[0:1], v2
254; GFX8-NEXT:    s_endpgm
255;
256; GFX10_W32-LABEL: test_div_fmas_f32:
257; GFX10_W32:       ; %bb.0:
258; GFX10_W32-NEXT:    s_clause 0x4
259; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0xb8
260; GFX10_W32-NEXT:    s_load_dword s3, s[0:1], 0x70
261; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x94
262; GFX10_W32-NEXT:    s_load_dword s5, s[0:1], 0x4c
263; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
264; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
265; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
267; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
268; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s4
269; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
270; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s5, v0, v1
271; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
272; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
273; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
274; GFX10_W32-NEXT:    s_endpgm
275;
276; GFX10_W64-LABEL: test_div_fmas_f32:
277; GFX10_W64:       ; %bb.0:
278; GFX10_W64-NEXT:    s_clause 0x4
279; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0xb8
280; GFX10_W64-NEXT:    s_load_dword s3, s[0:1], 0x70
281; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x94
282; GFX10_W64-NEXT:    s_load_dword s5, s[0:1], 0x4c
283; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
284; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
285; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
286; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
287; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
288; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s4
289; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s5, v0, v1
290; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
291; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
292; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
293; GFX10_W64-NEXT:    s_endpgm
294  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d)
295  store float %result, float addrspace(1)* %out, align 4
296  ret void
297}
298
; The first intrinsic operand is the constant 1.0. Every target is expected to
; emit it as the literal "1.0" in the first source slot of v_div_fmas_f32
; rather than moving it through a register. %a is not used by the IR body and
; the checks contain no load for it; only %out, %b, %c and %d are loaded.
; GFX7/GFX8 expect "s_nop 3" between v_cmp and v_div_fmas_f32; GFX10 expects
; none and passes %b directly from an SGPR.
299define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
300; GFX7-LABEL: test_div_fmas_f32_inline_imm_0:
301; GFX7:       ; %bb.0:
302; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
303; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x1c
304; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x25
305; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x2e
306; GFX7-NEXT:    s_mov_b32 s6, -1
307; GFX7-NEXT:    s_mov_b32 s7, 0xf000
308; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
309; GFX7-NEXT:    v_mov_b32_e32 v0, s2
310; GFX7-NEXT:    v_mov_b32_e32 v1, s3
311; GFX7-NEXT:    s_and_b32 s0, 1, s0
312; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
313; GFX7-NEXT:    s_nop 3
314; GFX7-NEXT:    v_div_fmas_f32 v0, 1.0, v0, v1
315; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
316; GFX7-NEXT:    s_endpgm
317;
318; GFX8-LABEL: test_div_fmas_f32_inline_imm_0:
319; GFX8:       ; %bb.0:
320; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x70
321; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x94
322; GFX8-NEXT:    s_load_dword s4, s[0:1], 0xb8
323; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
324; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX8-NEXT:    v_mov_b32_e32 v0, s2
326; GFX8-NEXT:    v_mov_b32_e32 v1, s3
327; GFX8-NEXT:    s_and_b32 s2, 1, s4
328; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
329; GFX8-NEXT:    s_nop 3
330; GFX8-NEXT:    v_div_fmas_f32 v2, 1.0, v0, v1
331; GFX8-NEXT:    v_mov_b32_e32 v0, s0
332; GFX8-NEXT:    v_mov_b32_e32 v1, s1
333; GFX8-NEXT:    flat_store_dword v[0:1], v2
334; GFX8-NEXT:    s_endpgm
335;
336; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0:
337; GFX10_W32:       ; %bb.0:
338; GFX10_W32-NEXT:    s_clause 0x3
339; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0xb8
340; GFX10_W32-NEXT:    s_load_dword s3, s[0:1], 0x94
341; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x70
342; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
343; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
344; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
345; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
346; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
347; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
348; GFX10_W32-NEXT:    v_div_fmas_f32 v2, 1.0, s4, v0
349; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
350; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
351; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
352; GFX10_W32-NEXT:    s_endpgm
353;
354; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0:
355; GFX10_W64:       ; %bb.0:
356; GFX10_W64-NEXT:    s_clause 0x3
357; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0xb8
358; GFX10_W64-NEXT:    s_load_dword s3, s[0:1], 0x94
359; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x70
360; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
361; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
363; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
364; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
365; GFX10_W64-NEXT:    v_div_fmas_f32 v2, 1.0, s4, v0
366; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
367; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
368; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
369; GFX10_W64-NEXT:    s_endpgm
370  %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d)
371  store float %result, float addrspace(1)* %out, align 4
372  ret void
373}
374
; The second intrinsic operand is the constant 1.0; the checks expect it as
; the literal "1.0" in the second source slot of v_div_fmas_f32.
; Unlike the neighbouring inline-immediate kernels, %a, %b and %c are declared
; back to back here (only %d is preceded by [8 x i32] padding), which is why
; the expected load offsets differ from those tests. %b is not used by the IR
; body and the checks contain no load for it.
; GFX7/GFX8 expect "s_nop 3" between v_cmp and v_div_fmas_f32; GFX10 expects
; none and passes %a directly from an SGPR.
375define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) {
376; GFX7-LABEL: test_div_fmas_f32_inline_imm_1:
377; GFX7:       ; %bb.0:
378; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
379; GFX7-NEXT:    s_load_dword s2, s[0:1], 0xb
380; GFX7-NEXT:    s_load_dword s3, s[0:1], 0xd
381; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x16
382; GFX7-NEXT:    s_mov_b32 s6, -1
383; GFX7-NEXT:    s_mov_b32 s7, 0xf000
384; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
385; GFX7-NEXT:    v_mov_b32_e32 v0, s2
386; GFX7-NEXT:    v_mov_b32_e32 v1, s3
387; GFX7-NEXT:    s_and_b32 s0, 1, s0
388; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
389; GFX7-NEXT:    s_nop 3
390; GFX7-NEXT:    v_div_fmas_f32 v0, v0, 1.0, v1
391; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
392; GFX7-NEXT:    s_endpgm
393;
394; GFX8-LABEL: test_div_fmas_f32_inline_imm_1:
395; GFX8:       ; %bb.0:
396; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x2c
397; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x34
398; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x58
399; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
400; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
401; GFX8-NEXT:    v_mov_b32_e32 v0, s2
402; GFX8-NEXT:    v_mov_b32_e32 v1, s3
403; GFX8-NEXT:    s_and_b32 s2, 1, s4
404; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
405; GFX8-NEXT:    s_nop 3
406; GFX8-NEXT:    v_div_fmas_f32 v2, v0, 1.0, v1
407; GFX8-NEXT:    v_mov_b32_e32 v0, s0
408; GFX8-NEXT:    v_mov_b32_e32 v1, s1
409; GFX8-NEXT:    flat_store_dword v[0:1], v2
410; GFX8-NEXT:    s_endpgm
411;
412; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1:
413; GFX10_W32:       ; %bb.0:
414; GFX10_W32-NEXT:    s_clause 0x3
415; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0x58
416; GFX10_W32-NEXT:    s_load_dword s3, s[0:1], 0x34
417; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x2c
418; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
419; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
420; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
422; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
423; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
424; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, 1.0, v0
425; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
426; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
427; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
428; GFX10_W32-NEXT:    s_endpgm
429;
430; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1:
431; GFX10_W64:       ; %bb.0:
432; GFX10_W64-NEXT:    s_clause 0x3
433; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0x58
434; GFX10_W64-NEXT:    s_load_dword s3, s[0:1], 0x34
435; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x2c
436; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
437; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
438; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
439; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
440; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
441; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, 1.0, v0
442; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
443; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
444; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
445; GFX10_W64-NEXT:    s_endpgm
446  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d)
447  store float %result, float addrspace(1)* %out, align 4
448  ret void
449}
450
; The third intrinsic operand is the constant 1.0; the checks expect it as the
; literal "1.0" in the last source slot of v_div_fmas_f32. %c is not used by
; the IR body and the checks contain no load for it; only %out, %a, %b and %d
; are loaded.
; GFX7/GFX8 expect "s_nop 3" between v_cmp and v_div_fmas_f32; GFX10 expects
; none and passes %a directly from an SGPR.
451define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) {
452; GFX7-LABEL: test_div_fmas_f32_inline_imm_2:
453; GFX7:       ; %bb.0:
454; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
455; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
456; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
457; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x2e
458; GFX7-NEXT:    s_mov_b32 s6, -1
459; GFX7-NEXT:    s_mov_b32 s7, 0xf000
460; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
461; GFX7-NEXT:    v_mov_b32_e32 v0, s2
462; GFX7-NEXT:    v_mov_b32_e32 v1, s3
463; GFX7-NEXT:    s_and_b32 s0, 1, s0
464; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
465; GFX7-NEXT:    s_nop 3
466; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, 1.0
467; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
468; GFX7-NEXT:    s_endpgm
469;
470; GFX8-LABEL: test_div_fmas_f32_inline_imm_2:
471; GFX8:       ; %bb.0:
472; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
473; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
474; GFX8-NEXT:    s_load_dword s4, s[0:1], 0xb8
475; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
476; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX8-NEXT:    v_mov_b32_e32 v0, s2
478; GFX8-NEXT:    v_mov_b32_e32 v1, s3
479; GFX8-NEXT:    s_and_b32 s2, 1, s4
480; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
481; GFX8-NEXT:    s_nop 3
482; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, 1.0
483; GFX8-NEXT:    v_mov_b32_e32 v0, s0
484; GFX8-NEXT:    v_mov_b32_e32 v1, s1
485; GFX8-NEXT:    flat_store_dword v[0:1], v2
486; GFX8-NEXT:    s_endpgm
487;
488; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2:
489; GFX10_W32:       ; %bb.0:
490; GFX10_W32-NEXT:    s_clause 0x3
491; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0xb8
492; GFX10_W32-NEXT:    s_load_dword s3, s[0:1], 0x70
493; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x4c
494; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
495; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
496; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
497; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
498; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s3
499; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
500; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, v0, 1.0
501; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
502; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
503; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
504; GFX10_W32-NEXT:    s_endpgm
505;
506; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2:
507; GFX10_W64:       ; %bb.0:
508; GFX10_W64-NEXT:    s_clause 0x3
509; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0xb8
510; GFX10_W64-NEXT:    s_load_dword s3, s[0:1], 0x70
511; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x4c
512; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
513; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
514; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
515; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s3
516; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
517; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, v0, 1.0
518; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
519; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
520; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
521; GFX10_W64-NEXT:    s_endpgm
522  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d)
523  store float %result, float addrspace(1)* %out, align 4
524  ret void
525}
526
; f64 kernel variant. The kernel arguments are declared without padding, and
; the checks expect %out together with the three doubles to be fetched by a
; single s_load_dwordx8, with the i1 condition fetched by a separate
; s_load_dword. The condition is masked with s_and_b32 and compared with
; v_cmp_ne_u32_e64 to set vcc (vcc_lo for wave32).
; GFX7/GFX8 copy all three doubles into VGPR pairs and expect "s_nop 3";
; GFX10 passes %a as an SGPR pair and expects no nop. The result is stored
; with flat_store_dwordx2 on GFX7/GFX8 and global_store_dwordx2 on GFX10.
527define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) {
528; GFX7-LABEL: test_div_fmas_f64:
529; GFX7:       ; %bb.0:
530; GFX7-NEXT:	s_load_dwordx8 s[4:11], s[0:1], 0x9
531; GFX7-NEXT:	s_load_dword s0, s[0:1], 0x11
532; GFX7-NEXT:	s_waitcnt lgkmcnt(0)
533; GFX7-NEXT:	v_mov_b32_e32 v0, s6
534; GFX7-NEXT:	v_mov_b32_e32 v2, s8
535; GFX7-NEXT:	v_mov_b32_e32 v4, s10
536; GFX7-NEXT:	s_and_b32 s0, 1, s0
537; GFX7-NEXT:	v_mov_b32_e32 v1, s7
538; GFX7-NEXT:	v_mov_b32_e32 v3, s9
539; GFX7-NEXT:	v_mov_b32_e32 v5, s11
540; GFX7-NEXT:	v_cmp_ne_u32_e64 vcc, 0, s0
541; GFX7-NEXT:	s_nop 3
542; GFX7-NEXT:	v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
543; GFX7-NEXT:	v_mov_b32_e32 v2, s4
544; GFX7-NEXT:	v_mov_b32_e32 v3, s5
545; GFX7-NEXT:	flat_store_dwordx2 v[2:3], v[0:1]
546; GFX7-NEXT:	s_endpgm
547;
548; GFX8-LABEL: test_div_fmas_f64:
549; GFX8:       ; %bb.0:
550; GFX8-NEXT:	s_load_dwordx8 s[4:11], s[0:1], 0x24
551; GFX8-NEXT:	s_load_dword s0, s[0:1], 0x44
552; GFX8-NEXT:	s_waitcnt lgkmcnt(0)
553; GFX8-NEXT:	v_mov_b32_e32 v0, s6
554; GFX8-NEXT:	v_mov_b32_e32 v2, s8
555; GFX8-NEXT:	v_mov_b32_e32 v4, s10
556; GFX8-NEXT:	s_and_b32 s0, 1, s0
557; GFX8-NEXT:	v_mov_b32_e32 v1, s7
558; GFX8-NEXT:	v_mov_b32_e32 v3, s9
559; GFX8-NEXT:	v_mov_b32_e32 v5, s11
560; GFX8-NEXT:	v_cmp_ne_u32_e64 vcc, 0, s0
561; GFX8-NEXT:	s_nop 3
562; GFX8-NEXT:	v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
563; GFX8-NEXT:	v_mov_b32_e32 v2, s4
564; GFX8-NEXT:	v_mov_b32_e32 v3, s5
565; GFX8-NEXT:	flat_store_dwordx2 v[2:3], v[0:1]
566; GFX8-NEXT:	s_endpgm
567;
568; GFX10_W32-LABEL: test_div_fmas_f64:
569; GFX10_W32:       ; %bb.0:
570; GFX10_W32-NEXT:    s_clause 0x1
571; GFX10_W32-NEXT:    s_load_dword s8, s[0:1], 0x44
572; GFX10_W32-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
573; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
574; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
575; GFX10_W32-NEXT:    s_and_b32 s8, 1, s8
576; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s4
577; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s6
578; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s5
579; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s7
580; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s8
581; GFX10_W32-NEXT:    v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
582; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s1
583; GFX10_W32-NEXT:    v_mov_b32_e32 v2, s0
584; GFX10_W32-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
585; GFX10_W32-NEXT:    s_endpgm
586;
587; GFX10_W64-LABEL: test_div_fmas_f64:
588; GFX10_W64:       ; %bb.0:
589; GFX10_W64-NEXT:    s_clause 0x1
590; GFX10_W64-NEXT:    s_load_dword s8, s[0:1], 0x44
591; GFX10_W64-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
592; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
593; GFX10_W64-NEXT:    s_and_b32 s8, 1, s8
594; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s4
595; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s6
596; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s8
597; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s5
598; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s7
599; GFX10_W64-NEXT:    v_div_fmas_f64 v[0:1], s[2:3], v[0:1], v[2:3]
600; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s1
601; GFX10_W64-NEXT:    v_mov_b32_e32 v2, s0
602; GFX10_W64-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
603; GFX10_W64-NEXT:    s_endpgm
604  %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d)
605  store double %result, double addrspace(1)* %out, align 8
606  ret void
607}
608
; Kernel variant where the condition is computed in the kernel
; ("icmp eq i32 %i, 0") instead of being passed in as an i1. %a, %b, %c and %i
; are contiguous kernel arguments, and the checks expect them to be fetched
; with one s_load_dwordx4. The compare is expected as s_cmp_eq_u32 +
; s_cselect_b32, followed by s_and_b32 with 1 and a v_cmp_ne_u32_e64 that
; writes vcc (vcc_lo for wave32).
; GFX7/GFX8 expect "s_nop 3" between v_cmp and v_div_fmas_f32; GFX10 expects
; none and passes %a directly from an SGPR.
609define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) {
610; GFX7-LABEL: test_div_fmas_f32_cond_to_vcc:
611; GFX7:       ; %bb.0:
612; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
613; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
614; GFX7-NEXT:    s_mov_b32 s6, -1
615; GFX7-NEXT:    s_mov_b32 s7, 0xf000
616; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
617; GFX7-NEXT:    s_cmp_eq_u32 s3, 0
618; GFX7-NEXT:    s_cselect_b32 s3, 1, 0
619; GFX7-NEXT:    v_mov_b32_e32 v0, s0
620; GFX7-NEXT:    s_and_b32 s0, 1, s3
621; GFX7-NEXT:    v_mov_b32_e32 v1, s1
622; GFX7-NEXT:    v_mov_b32_e32 v2, s2
623; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
624; GFX7-NEXT:    s_nop 3
625; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
626; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
627; GFX7-NEXT:    s_endpgm
628;
629; GFX8-LABEL: test_div_fmas_f32_cond_to_vcc:
630; GFX8:       ; %bb.0:
631; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
632; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
633; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX8-NEXT:    s_cmp_eq_u32 s7, 0
635; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
636; GFX8-NEXT:    s_and_b32 s2, 1, s2
637; GFX8-NEXT:    v_mov_b32_e32 v0, s4
638; GFX8-NEXT:    v_mov_b32_e32 v1, s5
639; GFX8-NEXT:    v_mov_b32_e32 v2, s6
640; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
641; GFX8-NEXT:    s_nop 3
642; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
643; GFX8-NEXT:    v_mov_b32_e32 v0, s0
644; GFX8-NEXT:    v_mov_b32_e32 v1, s1
645; GFX8-NEXT:    flat_store_dword v[0:1], v2
646; GFX8-NEXT:    s_endpgm
647;
648; GFX10_W32-LABEL: test_div_fmas_f32_cond_to_vcc:
649; GFX10_W32:       ; %bb.0:
650; GFX10_W32-NEXT:    s_clause 0x1
651; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
652; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
653; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
654; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX10_W32-NEXT:    s_cmp_eq_u32 s7, 0
656; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s5
657; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s6
658; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
659; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
660; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
661; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
662; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
663; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
664; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
665; GFX10_W32-NEXT:    s_endpgm
666;
667; GFX10_W64-LABEL: test_div_fmas_f32_cond_to_vcc:
668; GFX10_W64:       ; %bb.0:
669; GFX10_W64-NEXT:    s_clause 0x1
670; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
671; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
672; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
673; GFX10_W64-NEXT:    s_cmp_eq_u32 s7, 0
674; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s5
675; GFX10_W64-NEXT:    s_cselect_b32 s2, 1, 0
676; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s6
677; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
678; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
679; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
680; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
681; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
682; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
683; GFX10_W64-NEXT:    s_endpgm
684  %cmp = icmp eq i32 %i, 0
685  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp)
686  store float %result, float addrspace(1)* %out, align 4
687  ret void
688}
689
; The condition operand is the constant "i1 false". The checks expect the
; constant to still be turned into a VCC value by a compare of two immediates,
; "v_cmp_ne_u32_e64 vcc, 0, 0" (vcc_lo for wave32), placed before the
; s_waitcnt. No s_nop is expected on any target here: on GFX7/GFX8 the
; s_waitcnt and the v_mov_b32 copies sit between the compare and
; v_div_fmas_f32.
690define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
691; GFX7-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
692; GFX7:       ; %bb.0:
693; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
694; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
695; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
696; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x25
697; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 0
698; GFX7-NEXT:    s_mov_b32 s6, -1
699; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
700; GFX7-NEXT:    v_mov_b32_e32 v0, s2
701; GFX7-NEXT:    v_mov_b32_e32 v1, s3
702; GFX7-NEXT:    v_mov_b32_e32 v2, s0
703; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
704; GFX7-NEXT:    s_mov_b32 s7, 0xf000
705; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
706; GFX7-NEXT:    s_endpgm
707;
708; GFX8-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
709; GFX8:       ; %bb.0:
710; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
711; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
712; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x94
713; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 0
714; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
715; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
716; GFX8-NEXT:    v_mov_b32_e32 v0, s2
717; GFX8-NEXT:    v_mov_b32_e32 v1, s3
718; GFX8-NEXT:    v_mov_b32_e32 v2, s4
719; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
720; GFX8-NEXT:    v_mov_b32_e32 v0, s0
721; GFX8-NEXT:    v_mov_b32_e32 v1, s1
722; GFX8-NEXT:    flat_store_dword v[0:1], v2
723; GFX8-NEXT:    s_endpgm
724;
725; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
726; GFX10_W32:       ; %bb.0:
727; GFX10_W32-NEXT:    s_clause 0x3
728; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0x70
729; GFX10_W32-NEXT:    s_load_dword s3, s[0:1], 0x94
730; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x4c
731; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
732; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, 0
733; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
734; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
735; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
736; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
737; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
738; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
739; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
740; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
741; GFX10_W32-NEXT:    s_endpgm
742;
743; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc:
744; GFX10_W64:       ; %bb.0:
745; GFX10_W64-NEXT:    s_clause 0x3
746; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0x70
747; GFX10_W64-NEXT:    s_load_dword s3, s[0:1], 0x94
748; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x4c
749; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
750; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 0
751; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
752; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s2
753; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s3
754; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
755; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
756; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
757; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
758; GFX10_W64-NEXT:    s_endpgm
759  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false)
760  store float %result, float addrspace(1)* %out, align 4
761  ret void
762}
763
; Kernel test: llvm.amdgcn.div.fmas.f32 is called with the constant condition
; `i1 true` and the result is stored to %out. The three float operands are
; separated by [8 x i32] padding arguments in the kernel signature.
; The expected output below materializes the condition into vcc (vcc_lo in the
; wave32 checks) with a `v_cmp_ne_u32_e64 ..., 0, 1` ahead of v_div_fmas_f32.
764define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) {
765; GFX7-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
766; GFX7:       ; %bb.0:
767; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
768; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
769; GFX7-NEXT:    s_load_dword s3, s[0:1], 0x1c
770; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x25
771; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 1
772; GFX7-NEXT:    s_mov_b32 s6, -1
773; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX7-NEXT:    v_mov_b32_e32 v0, s2
775; GFX7-NEXT:    v_mov_b32_e32 v1, s3
776; GFX7-NEXT:    v_mov_b32_e32 v2, s0
777; GFX7-NEXT:    v_div_fmas_f32 v0, v0, v1, v2
778; GFX7-NEXT:    s_mov_b32 s7, 0xf000
779; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
780; GFX7-NEXT:    s_endpgm
781;
782; GFX8-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
783; GFX8:       ; %bb.0:
784; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
785; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
786; GFX8-NEXT:    s_load_dword s4, s[0:1], 0x94
787; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 1
788; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
789; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX8-NEXT:    v_mov_b32_e32 v0, s2
791; GFX8-NEXT:    v_mov_b32_e32 v1, s3
792; GFX8-NEXT:    v_mov_b32_e32 v2, s4
793; GFX8-NEXT:    v_div_fmas_f32 v2, v0, v1, v2
794; GFX8-NEXT:    v_mov_b32_e32 v0, s0
795; GFX8-NEXT:    v_mov_b32_e32 v1, s1
796; GFX8-NEXT:    flat_store_dword v[0:1], v2
797; GFX8-NEXT:    s_endpgm
798;
799; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
800; GFX10_W32:       ; %bb.0:
801; GFX10_W32-NEXT:    s_clause 0x3
802; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0x70
803; GFX10_W32-NEXT:    s_load_dword s3, s[0:1], 0x94
804; GFX10_W32-NEXT:    s_load_dword s4, s[0:1], 0x4c
805; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
806; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, 1
807; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
808; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
809; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s2
810; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s3
811; GFX10_W32-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
812; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
813; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
814; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
815; GFX10_W32-NEXT:    s_endpgm
816;
817; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc:
818; GFX10_W64:       ; %bb.0:
819; GFX10_W64-NEXT:    s_clause 0x3
820; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0x70
821; GFX10_W64-NEXT:    s_load_dword s3, s[0:1], 0x94
822; GFX10_W64-NEXT:    s_load_dword s4, s[0:1], 0x4c
823; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
824; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, 1
825; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
826; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s2
827; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s3
828; GFX10_W64-NEXT:    v_div_fmas_f32 v2, s4, v0, v1
829; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
830; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
831; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
832; GFX10_W64-NEXT:    s_endpgm
; The condition operand is the literal `true`; no compare exists in the IR.
833  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true)
834  store float %result, float addrspace(1)* %out, align 4
835  ret void
836}
837
; Kernel test: the i1 condition passed to llvm.amdgcn.div.fmas.f32 is the
; logical AND of two compares, (workitem id x == 0) and (%d != 0).
; Operands a, b, c are volatile loads from %in at element offsets tid, tid+1
; and tid+2; the result is stored to %out at element offset 2.
; The expected output below keeps three separate dword loads and combines the
; two compare results with an s_and into vcc (vcc_lo in the wave32 checks)
; before v_div_fmas_f32.
838define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, [8 x i32], i32 %d) {
839; GFX7-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
840; GFX7:       ; %bb.0:
841; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
842; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x15
843; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
844; GFX7-NEXT:    v_lshl_b64 v[1:2], v[0:1], 2
845; GFX7-NEXT:    s_mov_b32 s2, 0
846; GFX7-NEXT:    s_mov_b32 s3, 0xf000
847; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
848; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
849; GFX7-NEXT:    buffer_load_dword v3, v[1:2], s[0:3], 0 addr64
850; GFX7-NEXT:    buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4
851; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
852; GFX7-NEXT:    buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
853; GFX7-NEXT:    s_cmp_lg_u32 s8, 0
854; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
855; GFX7-NEXT:    s_and_b32 s0, 1, s6
856; GFX7-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, s0
857; GFX7-NEXT:    s_mov_b32 s2, -1
858; GFX7-NEXT:    s_and_b64 vcc, vcc, s[0:1]
859; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
860; GFX7-NEXT:    s_waitcnt vmcnt(0)
861; GFX7-NEXT:    v_div_fmas_f32 v0, v3, v4, v0
862; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
863; GFX7-NEXT:    s_endpgm
864;
865; GFX8-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
866; GFX8:       ; %bb.0:
867; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
868; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x54
869; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
870; GFX8-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
871; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
872; GFX8-NEXT:    v_mov_b32_e32 v3, s6
873; GFX8-NEXT:    v_mov_b32_e32 v4, s7
874; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
875; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
876; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 4, v1
877; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
878; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 8, v1
879; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
880; GFX8-NEXT:    flat_load_dword v1, v[1:2]
881; GFX8-NEXT:    flat_load_dword v2, v[3:4]
882; GFX8-NEXT:    flat_load_dword v3, v[5:6]
883; GFX8-NEXT:    s_add_u32 s0, s4, 8
884; GFX8-NEXT:    s_addc_u32 s1, s5, 0
885; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
886; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
887; GFX8-NEXT:    s_and_b32 s2, 1, s2
888; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
889; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
890; GFX8-NEXT:    s_and_b64 vcc, vcc, s[2:3]
891; GFX8-NEXT:    s_nop 1
892; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
893; GFX8-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
894; GFX8-NEXT:    v_mov_b32_e32 v0, s0
895; GFX8-NEXT:    v_mov_b32_e32 v1, s1
896; GFX8-NEXT:    flat_store_dword v[0:1], v2
897; GFX8-NEXT:    s_endpgm
898;
899; GFX10_W32-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
900; GFX10_W32:       ; %bb.0:
901; GFX10_W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
902; GFX10_W32-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
903; GFX10_W32-NEXT:    s_load_dword s2, s[0:1], 0x54
904; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
905; GFX10_W32-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
906; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s6
908; GFX10_W32-NEXT:    v_mov_b32_e32 v4, s7
909; GFX10_W32-NEXT:    s_add_u32 s0, s4, 8
910; GFX10_W32-NEXT:    s_addc_u32 s1, s5, 0
911; GFX10_W32-NEXT:    s_cmp_lg_u32 s2, 0
912; GFX10_W32-NEXT:    v_add_co_u32_e64 v1, vcc_lo, v3, v1
913; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
914; GFX10_W32-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
915; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
916; GFX10_W32-NEXT:    v_add_co_u32_e64 v3, vcc_lo, v1, 8
917; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s2, 0, s2
918; GFX10_W32-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo
919; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
920; GFX10_W32-NEXT:    s_clause 0x2
921; GFX10_W32-NEXT:    global_load_dword v1, v[1:2], off
922; GFX10_W32-NEXT:    global_load_dword v2, v[3:4], off offset:-4
923; GFX10_W32-NEXT:    global_load_dword v3, v[3:4], off
924; GFX10_W32-NEXT:    s_and_b32 vcc_lo, vcc_lo, s2
925; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
926; GFX10_W32-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
927; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
928; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
929; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
930; GFX10_W32-NEXT:    s_endpgm
931;
932; GFX10_W64-LABEL: test_div_fmas_f32_logical_cond_to_vcc:
933; GFX10_W64:       ; %bb.0:
934; GFX10_W64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
935; GFX10_W64-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
936; GFX10_W64-NEXT:    s_load_dword s2, s[0:1], 0x54
937; GFX10_W64-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
938; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
939; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s6
940; GFX10_W64-NEXT:    v_mov_b32_e32 v4, s7
941; GFX10_W64-NEXT:    s_add_u32 s0, s4, 8
942; GFX10_W64-NEXT:    s_addc_u32 s1, s5, 0
943; GFX10_W64-NEXT:    s_cmp_lg_u32 s2, 0
944; GFX10_W64-NEXT:    v_add_co_u32_e64 v1, vcc, v3, v1
945; GFX10_W64-NEXT:    s_cselect_b32 s2, 1, 0
946; GFX10_W64-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc
947; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
948; GFX10_W64-NEXT:    v_add_co_u32_e64 v3, vcc, v1, 8
949; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
950; GFX10_W64-NEXT:    v_add_co_ci_u32_e32 v4, vcc, 0, v2, vcc
951; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
952; GFX10_W64-NEXT:    s_clause 0x2
953; GFX10_W64-NEXT:    global_load_dword v1, v[1:2], off
954; GFX10_W64-NEXT:    global_load_dword v2, v[3:4], off offset:-4
955; GFX10_W64-NEXT:    global_load_dword v3, v[3:4], off
956; GFX10_W64-NEXT:    s_and_b64 vcc, vcc, s[2:3]
957; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
958; GFX10_W64-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
959; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
960; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
961; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
962; GFX10_W64-NEXT:    s_endpgm
; Per-workitem addressing: a, b, c are three consecutive floats starting at
; %in[tid]; the output slot is the fixed element %out[2].
963  %tid = call i32 @llvm.amdgcn.workitem.id.x()
964  %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
965  %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
966  %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
967  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
968
969  %a = load volatile float, float addrspace(1)* %gep.a
970  %b = load volatile float, float addrspace(1)* %gep.b
971  %c = load volatile float, float addrspace(1)* %gep.c
972
; One compare depends on the workitem id, the other only on the kernel
; argument %d; their AND is the div.fmas condition.
973  %cmp0 = icmp eq i32 %tid, 0
974  %cmp1 = icmp ne i32 %d, 0
975  %and = and i1 %cmp0, %cmp1
976
977  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and)
978  store float %result, float addrspace(1)* %gep.out, align 4
979  ret void
980}
981
; Kernel test: the i1 condition passed to llvm.amdgcn.div.fmas.f32 comes from
; a phi. Workitems with id x == 0 branch to %bb, load an i32 from %dummy and
; contribute (value != 0); all other workitems reach %exit directly and
; contribute `false`.
; Operands a, b, c are plain (non-volatile) loads from %in at element offsets
; tid, tid+1 and tid+2; the expected output below fetches them with a single
; dwordx3 load. The result is stored to %out at element offset 2.
; In the expected output the phi value is carried as 0/1 in an SGPR
; (s_mov_b32 ..., 0 / s_cselect_b32 ..., 1, 0) and is turned into vcc with
; s_and_b32 + v_cmp_ne_u32 after exec is restored in the %exit block.
982define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, [8 x i32], float addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %dummy) {
983; GFX7-LABEL: test_div_fmas_f32_i1_phi_vcc:
984; GFX7:       ; %bb.0: ; %entry
985; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
986; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x13
987; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
988; GFX7-NEXT:    v_lshl_b64 v[1:2], v[0:1], 2
989; GFX7-NEXT:    s_mov_b32 s10, 0
990; GFX7-NEXT:    s_mov_b32 s11, 0xf000
991; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
992; GFX7-NEXT:    buffer_load_dwordx3 v[1:3], v[1:2], s[8:11], 0 addr64
993; GFX7-NEXT:    s_mov_b32 s2, 0
994; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
995; GFX7-NEXT:    s_and_saveexec_b64 s[6:7], vcc
996; GFX7-NEXT:    s_cbranch_execz BB13_2
997; GFX7-NEXT:  ; %bb.1: ; %bb
998; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x1d
999; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1000; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
1001; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1002; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
1003; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
1004; GFX7-NEXT:  BB13_2: ; %exit
1005; GFX7-NEXT:    s_or_b64 exec, exec, s[6:7]
1006; GFX7-NEXT:    s_and_b32 s0, 1, s2
1007; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
1008; GFX7-NEXT:    s_mov_b32 s10, -1
1009; GFX7-NEXT:    s_mov_b64 s[6:7], s[10:11]
1010; GFX7-NEXT:    s_nop 1
1011; GFX7-NEXT:    s_waitcnt vmcnt(0)
1012; GFX7-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
1013; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
1014; GFX7-NEXT:    s_endpgm
1015;
1016; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc:
1017; GFX8:       ; %bb.0: ; %entry
1018; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1019; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x4c
1020; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1021; GFX8-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
1022; GFX8-NEXT:    s_mov_b32 s2, 0
1023; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX8-NEXT:    v_mov_b32_e32 v3, s6
1025; GFX8-NEXT:    v_mov_b32_e32 v4, s7
1026; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1027; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
1028; GFX8-NEXT:    flat_load_dwordx3 v[1:3], v[1:2]
1029; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1030; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1031; GFX8-NEXT:    s_cbranch_execz BB13_2
1032; GFX8-NEXT:  ; %bb.1: ; %bb
1033; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x74
1034; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1035; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
1036; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1037; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
1038; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
1039; GFX8-NEXT:  BB13_2: ; %exit
1040; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1041; GFX8-NEXT:    s_add_u32 s0, s4, 8
1042; GFX8-NEXT:    s_addc_u32 s1, s5, 0
1043; GFX8-NEXT:    s_and_b32 s2, 1, s2
1044; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
1045; GFX8-NEXT:    s_nop 3
1046; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1047; GFX8-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
1048; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1049; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1050; GFX8-NEXT:    flat_store_dword v[0:1], v2
1051; GFX8-NEXT:    s_endpgm
1052;
1053; GFX10_W32-LABEL: test_div_fmas_f32_i1_phi_vcc:
1054; GFX10_W32:       ; %bb.0: ; %entry
1055; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
1056; GFX10_W32-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1057; GFX10_W32-NEXT:    s_mov_b32 s4, 0
1058; GFX10_W32-NEXT:    ; implicit-def: $vcc_hi
1059; GFX10_W32-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
1060; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
1061; GFX10_W32-NEXT:    v_mov_b32_e32 v4, s3
1062; GFX10_W32-NEXT:    v_mov_b32_e32 v3, s2
1063; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1064; GFX10_W32-NEXT:    v_add_co_u32_e64 v1, vcc_lo, v3, v1
1065; GFX10_W32-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
1066; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1067; GFX10_W32-NEXT:    global_load_dwordx3 v[1:3], v[1:2], off
1068; GFX10_W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
1069; GFX10_W32-NEXT:    s_cbranch_execz BB13_2
1070; GFX10_W32-NEXT:  ; %bb.1: ; %bb
1071; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x74
1072; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
1073; GFX10_W32-NEXT:    s_load_dword s0, s[0:1], 0x0
1074; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX10_W32-NEXT:    s_cmp_lg_u32 s0, 0
1076; GFX10_W32-NEXT:    s_cselect_b32 s4, 1, 0
1077; GFX10_W32-NEXT:  BB13_2: ; %exit
1078; GFX10_W32-NEXT:    v_nop
1079; GFX10_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
1080; GFX10_W32-NEXT:    s_and_b32 s0, 1, s4
1081; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
1082; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX10_W32-NEXT:    s_add_u32 s0, s2, 8
1084; GFX10_W32-NEXT:    s_addc_u32 s1, s3, 0
1085; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
1086; GFX10_W32-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
1087; GFX10_W32-NEXT:    v_mov_b32_e32 v0, s0
1088; GFX10_W32-NEXT:    v_mov_b32_e32 v1, s1
1089; GFX10_W32-NEXT:    global_store_dword v[0:1], v2, off
1090; GFX10_W32-NEXT:    s_endpgm
1091;
1092; GFX10_W64-LABEL: test_div_fmas_f32_i1_phi_vcc:
1093; GFX10_W64:       ; %bb.0: ; %entry
1094; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
1095; GFX10_W64-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
1096; GFX10_W64-NEXT:    s_mov_b32 s6, 0
1097; GFX10_W64-NEXT:    v_lshlrev_b64 v[1:2], 2, v[0:1]
1098; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
1099; GFX10_W64-NEXT:    v_mov_b32_e32 v4, s3
1100; GFX10_W64-NEXT:    v_mov_b32_e32 v3, s2
1101; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1102; GFX10_W64-NEXT:    v_add_co_u32_e64 v1, vcc, v3, v1
1103; GFX10_W64-NEXT:    v_add_co_ci_u32_e32 v2, vcc, v4, v2, vcc
1104; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1105; GFX10_W64-NEXT:    global_load_dwordx3 v[1:3], v[1:2], off
1106; GFX10_W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1107; GFX10_W64-NEXT:    s_cbranch_execz BB13_2
1108; GFX10_W64-NEXT:  ; %bb.1: ; %bb
1109; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x74
1110; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
1111; GFX10_W64-NEXT:    s_load_dword s0, s[0:1], 0x0
1112; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX10_W64-NEXT:    s_cmp_lg_u32 s0, 0
1114; GFX10_W64-NEXT:    s_cselect_b32 s6, 1, 0
1115; GFX10_W64-NEXT:  BB13_2: ; %exit
1116; GFX10_W64-NEXT:    v_nop
1117; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1118; GFX10_W64-NEXT:    s_and_b32 s0, 1, s6
1119; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
1120; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
1121; GFX10_W64-NEXT:    s_add_u32 s0, s2, 8
1122; GFX10_W64-NEXT:    s_addc_u32 s1, s3, 0
1123; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
1124; GFX10_W64-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
1125; GFX10_W64-NEXT:    v_mov_b32_e32 v0, s0
1126; GFX10_W64-NEXT:    v_mov_b32_e32 v1, s1
1127; GFX10_W64-NEXT:    global_store_dword v[0:1], v2, off
1128; GFX10_W64-NEXT:    s_endpgm
1129entry:
1130  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1131  %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
1132  %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
1133  %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
1134  %gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2
1135
1136  %a = load float, float addrspace(1)* %gep.a
1137  %b = load float, float addrspace(1)* %gep.b
1138  %c = load float, float addrspace(1)* %gep.c
1139
; Only workitems with id x == 0 take the %bb path.
1140  %cmp0 = icmp eq i32 %tid, 0
1141  br i1 %cmp0, label %bb, label %exit
1142
1143bb:
1144  %val = load i32, i32 addrspace(1)* %dummy
1145  %cmp1 = icmp ne i32 %val, 0
1146  br label %exit
1147
1148exit:
; i1 phi feeding the div.fmas condition: false when coming from %entry,
; (%val != 0) when coming from %bb.
1149  %cond = phi i1 [false, %entry], [%cmp1, %bb]
1150  %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cond)
1151  store float %result, float addrspace(1)* %gep.out, align 4
1152  ret void
1153}
1154
; Intrinsic declarations referenced by the tests; all three share attribute
; group #0 defined below.
1155declare i32 @llvm.amdgcn.workitem.id.x() #0
1156declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) #0
; NOTE(review): no call to the f64 variant appears in the visible part of the
; file; presumably it is used by tests earlier in the file (to be confirmed).
1157declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) #0
1158
1159attributes #0 = { nounwind readnone speculatable }
1160