1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
4; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-32 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-64 %s
6
; static_exact: pixel shader (amdgpu_ps) that calls llvm.amdgcn.wqm.demote with
; a constant false condition directly in the entry block, then exports %tmp1
; (1.0 when %arg0 < 0.0, otherwise 0.0) in all four export operands.
; %c1 is computed but not used by this function.
; For every run line the autogenerated assertions below expect an andn2 of
; exec with itself, then a branch on SCC to a block that zeroes exec, performs
; a null export and ends the program; the fall-through path also zeroes exec
; before the mrt1 export.
7define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
8; SI-LABEL: static_exact:
9; SI:       ; %bb.0: ; %.entry
10; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
11; SI-NEXT:    s_andn2_b64 exec, exec, exec
12; SI-NEXT:    s_cbranch_scc0 BB0_2
13; SI-NEXT:  ; %bb.1: ; %.entry
14; SI-NEXT:    s_mov_b64 exec, 0
15; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
16; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
17; SI-NEXT:    s_endpgm
18; SI-NEXT:  BB0_2:
19; SI-NEXT:    s_mov_b64 exec, 0
20; SI-NEXT:    exp null off, off, off, off done vm
21; SI-NEXT:    s_endpgm
22;
23; GFX9-LABEL: static_exact:
24; GFX9:       ; %bb.0: ; %.entry
25; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
26; GFX9-NEXT:    s_andn2_b64 exec, exec, exec
27; GFX9-NEXT:    s_cbranch_scc0 BB0_2
28; GFX9-NEXT:  ; %bb.1: ; %.entry
29; GFX9-NEXT:    s_mov_b64 exec, 0
30; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
31; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
32; GFX9-NEXT:    s_endpgm
33; GFX9-NEXT:  BB0_2:
34; GFX9-NEXT:    s_mov_b64 exec, 0
35; GFX9-NEXT:    exp null off, off, off, off done vm
36; GFX9-NEXT:    s_endpgm
37;
38; GFX10-32-LABEL: static_exact:
39; GFX10-32:       ; %bb.0: ; %.entry
40; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
41; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
42; GFX10-32-NEXT:    s_cbranch_scc0 BB0_2
43; GFX10-32-NEXT:  ; %bb.1: ; %.entry
44; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
45; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
46; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
47; GFX10-32-NEXT:    s_endpgm
48; GFX10-32-NEXT:  BB0_2:
49; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
50; GFX10-32-NEXT:    exp null off, off, off, off done vm
51; GFX10-32-NEXT:    s_endpgm
52;
53; GFX10-64-LABEL: static_exact:
54; GFX10-64:       ; %bb.0: ; %.entry
55; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
56; GFX10-64-NEXT:    s_andn2_b64 exec, exec, exec
57; GFX10-64-NEXT:    s_cbranch_scc0 BB0_2
58; GFX10-64-NEXT:  ; %bb.1: ; %.entry
59; GFX10-64-NEXT:    s_mov_b64 exec, 0
60; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
61; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
62; GFX10-64-NEXT:    s_endpgm
63; GFX10-64-NEXT:  BB0_2:
64; GFX10-64-NEXT:    s_mov_b64 exec, 0
65; GFX10-64-NEXT:    exp null off, off, off, off done vm
66; GFX10-64-NEXT:    s_endpgm
67.entry:
68  %c0 = fcmp olt float %arg0, 0.000000e+00
69  %c1 = fcmp oge float %arg1, 0.0
  ; The demote condition is the literal false, so it does not depend on any
  ; shader input ("static" case).
70  call void @llvm.amdgcn.wqm.demote(i1 false)
71  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
72  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
73  ret void
74}
75
; dynamic_exact: same shape as a straight-line demote test, but the condition
; passed to llvm.amdgcn.wqm.demote is the run-time value %c1 (%arg1 >= 0.0).
; After the demote, %tmp1 (1.0 when %arg0 < 0.0, otherwise 0.0) is exported in
; all four export operands.
; The autogenerated assertions below expect the compare result to be XORed
; with exec, that value to be removed (andn2) from a copy of exec saved at
; entry, and exec to be ANDed with the updated copy on the fall-through path.
; When the andn2 leaves SCC clear the code branches to a block that zeroes
; exec, performs a null export and ends the program.
76define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
77; SI-LABEL: dynamic_exact:
78; SI:       ; %bb.0: ; %.entry
79; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
80; SI-NEXT:    s_mov_b64 s[2:3], exec
81; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
82; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
83; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
84; SI-NEXT:    s_cbranch_scc0 BB1_2
85; SI-NEXT:  ; %bb.1: ; %.entry
86; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
87; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
88; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
89; SI-NEXT:    s_endpgm
90; SI-NEXT:  BB1_2:
91; SI-NEXT:    s_mov_b64 exec, 0
92; SI-NEXT:    exp null off, off, off, off done vm
93; SI-NEXT:    s_endpgm
94;
95; GFX9-LABEL: dynamic_exact:
96; GFX9:       ; %bb.0: ; %.entry
97; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
98; GFX9-NEXT:    s_mov_b64 s[2:3], exec
99; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
100; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
101; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
102; GFX9-NEXT:    s_cbranch_scc0 BB1_2
103; GFX9-NEXT:  ; %bb.1: ; %.entry
104; GFX9-NEXT:    s_and_b64 exec, exec, s[2:3]
105; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
106; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
107; GFX9-NEXT:    s_endpgm
108; GFX9-NEXT:  BB1_2:
109; GFX9-NEXT:    s_mov_b64 exec, 0
110; GFX9-NEXT:    exp null off, off, off, off done vm
111; GFX9-NEXT:    s_endpgm
112;
113; GFX10-32-LABEL: dynamic_exact:
114; GFX10-32:       ; %bb.0: ; %.entry
115; GFX10-32-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
116; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
117; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
118; GFX10-32-NEXT:    s_xor_b32 s0, s0, exec_lo
119; GFX10-32-NEXT:    s_andn2_b32 s1, s1, s0
120; GFX10-32-NEXT:    s_cbranch_scc0 BB1_2
121; GFX10-32-NEXT:  ; %bb.1: ; %.entry
122; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
123; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
124; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
125; GFX10-32-NEXT:    s_endpgm
126; GFX10-32-NEXT:  BB1_2:
127; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
128; GFX10-32-NEXT:    exp null off, off, off, off done vm
129; GFX10-32-NEXT:    s_endpgm
130;
131; GFX10-64-LABEL: dynamic_exact:
132; GFX10-64:       ; %bb.0: ; %.entry
133; GFX10-64-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
134; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
135; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
136; GFX10-64-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
137; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
138; GFX10-64-NEXT:    s_cbranch_scc0 BB1_2
139; GFX10-64-NEXT:  ; %bb.1: ; %.entry
140; GFX10-64-NEXT:    s_and_b64 exec, exec, s[2:3]
141; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
142; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
143; GFX10-64-NEXT:    s_endpgm
144; GFX10-64-NEXT:  BB1_2:
145; GFX10-64-NEXT:    s_mov_b64 exec, 0
146; GFX10-64-NEXT:    exp null off, off, off, off done vm
147; GFX10-64-NEXT:    s_endpgm
148.entry:
149  %c0 = fcmp olt float %arg0, 0.000000e+00
150  %c1 = fcmp oge float %arg1, 0.0
  ; The demote condition is a per-lane run-time value ("dynamic" case).
151  call void @llvm.amdgcn.wqm.demote(i1 %c1)
152  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
153  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
154  ret void
155}
156
; branch: the demote call lives in its own basic block (.demote), which is
; entered only when bit 0 of (fptosi(%arg0) | fptosi(%arg1)) is set, i.e. when
; %c2 is false. Both paths rejoin in .continue, which exports 1.0 when %c2 is
; true and 0.0 otherwise in all four export operands.
; The autogenerated assertions below expect a copy of exec to be saved at
; entry, the lanes executing .demote to be removed from that copy (andn2 with
; exec), and a branch on SCC to a shared exit block that zeroes exec, performs
; a null export and ends the program. On the fall-through path exec is zeroed
; in .demote and then restored by the s_or at the start of .continue.
157define amdgpu_ps void @branch(float %arg0, float %arg1) {
158; SI-LABEL: branch:
159; SI:       ; %bb.0: ; %.entry
160; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
161; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
162; SI-NEXT:    s_mov_b64 s[2:3], exec
163; SI-NEXT:    v_or_b32_e32 v0, v0, v1
164; SI-NEXT:    v_and_b32_e32 v1, 1, v0
165; SI-NEXT:    v_and_b32_e32 v0, 1, v0
166; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
167; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
168; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
169; SI-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
170; SI-NEXT:    s_cbranch_execz BB2_3
171; SI-NEXT:  ; %bb.1: ; %.demote
172; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
173; SI-NEXT:    s_cbranch_scc0 BB2_4
174; SI-NEXT:  ; %bb.2: ; %.demote
175; SI-NEXT:    s_mov_b64 exec, 0
176; SI-NEXT:  BB2_3: ; %.continue
177; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
178; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
179; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
180; SI-NEXT:    s_endpgm
181; SI-NEXT:  BB2_4:
182; SI-NEXT:    s_mov_b64 exec, 0
183; SI-NEXT:    exp null off, off, off, off done vm
184; SI-NEXT:    s_endpgm
185;
186; GFX9-LABEL: branch:
187; GFX9:       ; %bb.0: ; %.entry
188; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
189; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
190; GFX9-NEXT:    s_mov_b64 s[2:3], exec
191; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
192; GFX9-NEXT:    v_and_b32_e32 v1, 1, v0
193; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
194; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
195; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
196; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
197; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
198; GFX9-NEXT:    s_cbranch_execz BB2_3
199; GFX9-NEXT:  ; %bb.1: ; %.demote
200; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
201; GFX9-NEXT:    s_cbranch_scc0 BB2_4
202; GFX9-NEXT:  ; %bb.2: ; %.demote
203; GFX9-NEXT:    s_mov_b64 exec, 0
204; GFX9-NEXT:  BB2_3: ; %.continue
205; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
206; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
207; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
208; GFX9-NEXT:    s_endpgm
209; GFX9-NEXT:  BB2_4:
210; GFX9-NEXT:    s_mov_b64 exec, 0
211; GFX9-NEXT:    exp null off, off, off, off done vm
212; GFX9-NEXT:    s_endpgm
213;
214; GFX10-32-LABEL: branch:
215; GFX10-32:       ; %bb.0: ; %.entry
216; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
217; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
218; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
219; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
220; GFX10-32-NEXT:    v_and_b32_e32 v1, 1, v0
221; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
222; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
223; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
224; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
225; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
226; GFX10-32-NEXT:    s_cbranch_execz BB2_3
227; GFX10-32-NEXT:  ; %bb.1: ; %.demote
228; GFX10-32-NEXT:    s_andn2_b32 s1, s1, exec_lo
229; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
230; GFX10-32-NEXT:  ; %bb.2: ; %.demote
231; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
232; GFX10-32-NEXT:  BB2_3: ; %.continue
233; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
234; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
235; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
236; GFX10-32-NEXT:    s_endpgm
237; GFX10-32-NEXT:  BB2_4:
238; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
239; GFX10-32-NEXT:    exp null off, off, off, off done vm
240; GFX10-32-NEXT:    s_endpgm
241;
242; GFX10-64-LABEL: branch:
243; GFX10-64:       ; %bb.0: ; %.entry
244; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
245; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
246; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
247; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
248; GFX10-64-NEXT:    v_and_b32_e32 v1, 1, v0
249; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
250; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
251; GFX10-64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
252; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
253; GFX10-64-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
254; GFX10-64-NEXT:    s_cbranch_execz BB2_3
255; GFX10-64-NEXT:  ; %bb.1: ; %.demote
256; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
257; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
258; GFX10-64-NEXT:  ; %bb.2: ; %.demote
259; GFX10-64-NEXT:    s_mov_b64 exec, 0
260; GFX10-64-NEXT:  BB2_3: ; %.continue
261; GFX10-64-NEXT:    s_or_b64 exec, exec, s[0:1]
262; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
263; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
264; GFX10-64-NEXT:    s_endpgm
265; GFX10-64-NEXT:  BB2_4:
266; GFX10-64-NEXT:    s_mov_b64 exec, 0
267; GFX10-64-NEXT:    exp null off, off, off, off done vm
268; GFX10-64-NEXT:    s_endpgm
269.entry:
270  %i0 = fptosi float %arg0 to i32
271  %i1 = fptosi float %arg1 to i32
272  %c0 = or i32 %i0, %i1
273  %c1 = and i32 %c0, 1
274  %c2 = icmp eq i32 %c1, 0
  ; Lanes with bit 0 clear skip the demote entirely.
275  br i1 %c2, label %.continue, label %.demote
276
277.demote:
  ; Constant-false demote, but only for the lanes that branched here.
278  call void @llvm.amdgcn.wqm.demote(i1 false)
279  br label %.continue
280
281.continue:
282  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
283  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
284  ret void
285}
286
287
; wqm_demote_1: branches on %z < 0.0; lanes for which the compare is false go
; through .demote, which calls llvm.amdgcn.wqm.demote with a constant false
; condition. Both image samples come after the demote, in .continue: the first
; samples a 1D image at %coord, element 0 of the result is added to itself
; (%tex0 and %tex1 both extract element 0), and the sum is the coordinate of
; the second sample, whose result is returned.
; %idx, %data and %coord2 are not used by this function.
; The autogenerated assertions below expect exec to be saved and then passed
; through s_wqm at entry, the saved mask to be updated in .demote and its
; s_wqm ANDed into exec, and exec to be ANDed with the saved mask between the
; two samples. A clear SCC after the andn2 branches to a block that zeroes
; exec, performs a null export and ends the program.
288define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
289; SI-LABEL: wqm_demote_1:
290; SI:       ; %bb.0: ; %.entry
291; SI-NEXT:    s_mov_b64 s[12:13], exec
292; SI-NEXT:    s_wqm_b64 exec, exec
293; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
294; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
295; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
296; SI-NEXT:    s_cbranch_execz BB3_3
297; SI-NEXT:  ; %bb.1: ; %.demote
298; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
299; SI-NEXT:    s_cbranch_scc0 BB3_4
300; SI-NEXT:  ; %bb.2: ; %.demote
301; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
302; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
303; SI-NEXT:  BB3_3: ; %.continue
304; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
305; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
306; SI-NEXT:    s_waitcnt vmcnt(0)
307; SI-NEXT:    v_add_f32_e32 v0, v0, v0
308; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
309; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
310; SI-NEXT:    s_waitcnt vmcnt(0)
311; SI-NEXT:    s_branch BB3_5
312; SI-NEXT:  BB3_4:
313; SI-NEXT:    s_mov_b64 exec, 0
314; SI-NEXT:    exp null off, off, off, off done vm
315; SI-NEXT:    s_endpgm
316; SI-NEXT:  BB3_5:
317;
318; GFX9-LABEL: wqm_demote_1:
319; GFX9:       ; %bb.0: ; %.entry
320; GFX9-NEXT:    s_mov_b64 s[12:13], exec
321; GFX9-NEXT:    s_wqm_b64 exec, exec
322; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
323; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
324; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
325; GFX9-NEXT:    s_cbranch_execz BB3_3
326; GFX9-NEXT:  ; %bb.1: ; %.demote
327; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
328; GFX9-NEXT:    s_cbranch_scc0 BB3_4
329; GFX9-NEXT:  ; %bb.2: ; %.demote
330; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
331; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
332; GFX9-NEXT:  BB3_3: ; %.continue
333; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
334; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
335; GFX9-NEXT:    s_waitcnt vmcnt(0)
336; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
337; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
338; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
339; GFX9-NEXT:    s_waitcnt vmcnt(0)
340; GFX9-NEXT:    s_branch BB3_5
341; GFX9-NEXT:  BB3_4:
342; GFX9-NEXT:    s_mov_b64 exec, 0
343; GFX9-NEXT:    exp null off, off, off, off done vm
344; GFX9-NEXT:    s_endpgm
345; GFX9-NEXT:  BB3_5:
346;
347; GFX10-32-LABEL: wqm_demote_1:
348; GFX10-32:       ; %bb.0: ; %.entry
349; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
350; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
351; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
352; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
353; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
354; GFX10-32-NEXT:    s_cbranch_execz BB3_3
355; GFX10-32-NEXT:  ; %bb.1: ; %.demote
356; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
357; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
358; GFX10-32-NEXT:  ; %bb.2: ; %.demote
359; GFX10-32-NEXT:    s_wqm_b32 s14, s12
360; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
361; GFX10-32-NEXT:  BB3_3: ; %.continue
362; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
363; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
364; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
365; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
366; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
367; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
368; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
369; GFX10-32-NEXT:    s_branch BB3_5
370; GFX10-32-NEXT:  BB3_4:
371; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
372; GFX10-32-NEXT:    exp null off, off, off, off done vm
373; GFX10-32-NEXT:    s_endpgm
374; GFX10-32-NEXT:  BB3_5:
375;
376; GFX10-64-LABEL: wqm_demote_1:
377; GFX10-64:       ; %bb.0: ; %.entry
378; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
379; GFX10-64-NEXT:    s_wqm_b64 exec, exec
380; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
381; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
382; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
383; GFX10-64-NEXT:    s_cbranch_execz BB3_3
384; GFX10-64-NEXT:  ; %bb.1: ; %.demote
385; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
386; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
387; GFX10-64-NEXT:  ; %bb.2: ; %.demote
388; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
389; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
390; GFX10-64-NEXT:  BB3_3: ; %.continue
391; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
392; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
393; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
394; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
395; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
396; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
397; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
398; GFX10-64-NEXT:    s_branch BB3_5
399; GFX10-64-NEXT:  BB3_4:
400; GFX10-64-NEXT:    s_mov_b64 exec, 0
401; GFX10-64-NEXT:    exp null off, off, off, off done vm
402; GFX10-64-NEXT:    s_endpgm
403; GFX10-64-NEXT:  BB3_5:
404.entry:
405  %z.cmp = fcmp olt float %z, 0.0
406  br i1 %z.cmp, label %.continue, label %.demote
407
408.demote:
409  call void @llvm.amdgcn.wqm.demote(i1 false)
410  br label %.continue
411
412.continue:
  ; First sample; only element 0 of the result is consumed (twice).
413  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
414  %tex0 = extractelement <4 x float> %tex, i32 0
415  %tex1 = extractelement <4 x float> %tex, i32 0
416  %coord1 = fadd float %tex0, %tex1
  ; Second sample uses a coordinate derived from the first sample's result.
417  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
418
419  ret <4 x float> %rtex
420}
421
; wqm_demote_2: variant in which the first image sample is issued in the entry
; block, before the branch, and the branch condition is computed from the
; sampled value (%tex0 < 0.0) instead of from an argument. Lanes for which the
; compare is false go through .demote, which calls llvm.amdgcn.wqm.demote with
; a constant false condition. .continue adds element 0 of the first sample to
; itself (%tex0 and %tex1 both extract element 0) and uses the sum as the
; coordinate of a second sample, whose result is returned.
; %idx, %data, %coord2 and %z are not used by this function.
; The autogenerated assertions below expect exec to be saved and passed
; through s_wqm before the first sample, the saved mask to be updated in
; .demote and its s_wqm ANDed into exec, and exec to be ANDed with the saved
; mask before the second sample. A clear SCC after the andn2 branches to a
; block that zeroes exec, performs a null export and ends the program.
422define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
423; SI-LABEL: wqm_demote_2:
424; SI:       ; %bb.0: ; %.entry
425; SI-NEXT:    s_mov_b64 s[12:13], exec
426; SI-NEXT:    s_wqm_b64 exec, exec
427; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
428; SI-NEXT:    s_waitcnt vmcnt(0)
429; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
430; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
431; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
432; SI-NEXT:    s_cbranch_execz BB4_3
433; SI-NEXT:  ; %bb.1: ; %.demote
434; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
435; SI-NEXT:    s_cbranch_scc0 BB4_4
436; SI-NEXT:  ; %bb.2: ; %.demote
437; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
438; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
439; SI-NEXT:  BB4_3: ; %.continue
440; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
441; SI-NEXT:    v_add_f32_e32 v0, v0, v0
442; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
443; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
444; SI-NEXT:    s_waitcnt vmcnt(0)
445; SI-NEXT:    s_branch BB4_5
446; SI-NEXT:  BB4_4:
447; SI-NEXT:    s_mov_b64 exec, 0
448; SI-NEXT:    exp null off, off, off, off done vm
449; SI-NEXT:    s_endpgm
450; SI-NEXT:  BB4_5:
451;
452; GFX9-LABEL: wqm_demote_2:
453; GFX9:       ; %bb.0: ; %.entry
454; GFX9-NEXT:    s_mov_b64 s[12:13], exec
455; GFX9-NEXT:    s_wqm_b64 exec, exec
456; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
457; GFX9-NEXT:    s_waitcnt vmcnt(0)
458; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
459; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
460; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
461; GFX9-NEXT:    s_cbranch_execz BB4_3
462; GFX9-NEXT:  ; %bb.1: ; %.demote
463; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
464; GFX9-NEXT:    s_cbranch_scc0 BB4_4
465; GFX9-NEXT:  ; %bb.2: ; %.demote
466; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
467; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
468; GFX9-NEXT:  BB4_3: ; %.continue
469; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
470; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
471; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
472; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
473; GFX9-NEXT:    s_waitcnt vmcnt(0)
474; GFX9-NEXT:    s_branch BB4_5
475; GFX9-NEXT:  BB4_4:
476; GFX9-NEXT:    s_mov_b64 exec, 0
477; GFX9-NEXT:    exp null off, off, off, off done vm
478; GFX9-NEXT:    s_endpgm
479; GFX9-NEXT:  BB4_5:
480;
481; GFX10-32-LABEL: wqm_demote_2:
482; GFX10-32:       ; %bb.0: ; %.entry
483; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
484; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
485; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
486; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
487; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
488; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
489; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
490; GFX10-32-NEXT:    s_cbranch_execz BB4_3
491; GFX10-32-NEXT:  ; %bb.1: ; %.demote
492; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
493; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
494; GFX10-32-NEXT:  ; %bb.2: ; %.demote
495; GFX10-32-NEXT:    s_wqm_b32 s14, s12
496; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s14
497; GFX10-32-NEXT:  BB4_3: ; %.continue
498; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
499; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
500; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
501; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
502; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
503; GFX10-32-NEXT:    s_branch BB4_5
504; GFX10-32-NEXT:  BB4_4:
505; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
506; GFX10-32-NEXT:    exp null off, off, off, off done vm
507; GFX10-32-NEXT:    s_endpgm
508; GFX10-32-NEXT:  BB4_5:
509;
510; GFX10-64-LABEL: wqm_demote_2:
511; GFX10-64:       ; %bb.0: ; %.entry
512; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
513; GFX10-64-NEXT:    s_wqm_b64 exec, exec
514; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
515; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
516; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
517; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
518; GFX10-64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
519; GFX10-64-NEXT:    s_cbranch_execz BB4_3
520; GFX10-64-NEXT:  ; %bb.1: ; %.demote
521; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
522; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
523; GFX10-64-NEXT:  ; %bb.2: ; %.demote
524; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
525; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
526; GFX10-64-NEXT:  BB4_3: ; %.continue
527; GFX10-64-NEXT:    s_or_b64 exec, exec, s[14:15]
528; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
529; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
530; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
531; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
532; GFX10-64-NEXT:    s_branch BB4_5
533; GFX10-64-NEXT:  BB4_4:
534; GFX10-64-NEXT:    s_mov_b64 exec, 0
535; GFX10-64-NEXT:    exp null off, off, off, off done vm
536; GFX10-64-NEXT:    s_endpgm
537; GFX10-64-NEXT:  BB4_5:
538.entry:
  ; First sample happens before any demote; its element 0 drives the branch.
539  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
540  %tex0 = extractelement <4 x float> %tex, i32 0
541  %tex1 = extractelement <4 x float> %tex, i32 0
542  %z.cmp = fcmp olt float %tex0, 0.0
543  br i1 %z.cmp, label %.continue, label %.demote
544
545.demote:
546  call void @llvm.amdgcn.wqm.demote(i1 false)
547  br label %.continue
548
549.continue:
550  %coord1 = fadd float %tex0, %tex1
  ; Second sample comes after the (conditional) demote.
551  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
552
553  ret <4 x float> %rtex
554}
555
; wqm_demote_dynamic: straight-line variant with no IR branch. The first image
; sample is issued in the entry block, and %z.cmp (%tex0 < 0.0) is passed
; directly as the condition of llvm.amdgcn.wqm.demote. Element 0 of the first
; sample is then added to itself (%tex0 and %tex1 both extract element 0) and
; the sum is the coordinate of a second sample, whose result is returned.
; %idx, %data, %coord2 and %z are not used by this function.
; The autogenerated assertions below expect exec to be saved and passed
; through s_wqm before the first sample, the compare result to be XORed with
; exec and removed (andn2) from the saved mask, the s_wqm of the updated mask
; to be ANDed into exec, and exec to be ANDed with the updated mask itself
; before the second sample. A clear SCC after the andn2 branches to a block
; that zeroes exec, performs a null export and ends the program.
556define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
557; SI-LABEL: wqm_demote_dynamic:
558; SI:       ; %bb.0: ; %.entry
559; SI-NEXT:    s_mov_b64 s[12:13], exec
560; SI-NEXT:    s_wqm_b64 exec, exec
561; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
562; SI-NEXT:    s_waitcnt vmcnt(0)
563; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
564; SI-NEXT:    s_xor_b64 s[14:15], vcc, exec
565; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
566; SI-NEXT:    s_cbranch_scc0 BB5_2
567; SI-NEXT:  ; %bb.1: ; %.entry
568; SI-NEXT:    s_wqm_b64 s[14:15], s[12:13]
569; SI-NEXT:    s_and_b64 exec, exec, s[14:15]
570; SI-NEXT:    v_add_f32_e32 v0, v0, v0
571; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
572; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
573; SI-NEXT:    s_waitcnt vmcnt(0)
574; SI-NEXT:    s_branch BB5_3
575; SI-NEXT:  BB5_2:
576; SI-NEXT:    s_mov_b64 exec, 0
577; SI-NEXT:    exp null off, off, off, off done vm
578; SI-NEXT:    s_endpgm
579; SI-NEXT:  BB5_3:
580;
581; GFX9-LABEL: wqm_demote_dynamic:
582; GFX9:       ; %bb.0: ; %.entry
583; GFX9-NEXT:    s_mov_b64 s[12:13], exec
584; GFX9-NEXT:    s_wqm_b64 exec, exec
585; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
586; GFX9-NEXT:    s_waitcnt vmcnt(0)
587; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
588; GFX9-NEXT:    s_xor_b64 s[14:15], vcc, exec
589; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
590; GFX9-NEXT:    s_cbranch_scc0 BB5_2
591; GFX9-NEXT:  ; %bb.1: ; %.entry
592; GFX9-NEXT:    s_wqm_b64 s[14:15], s[12:13]
593; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
594; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
595; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
596; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
597; GFX9-NEXT:    s_waitcnt vmcnt(0)
598; GFX9-NEXT:    s_branch BB5_3
599; GFX9-NEXT:  BB5_2:
600; GFX9-NEXT:    s_mov_b64 exec, 0
601; GFX9-NEXT:    exp null off, off, off, off done vm
602; GFX9-NEXT:    s_endpgm
603; GFX9-NEXT:  BB5_3:
604;
605; GFX10-32-LABEL: wqm_demote_dynamic:
606; GFX10-32:       ; %bb.0: ; %.entry
607; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
608; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
609; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
610; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
611; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
612; GFX10-32-NEXT:    s_xor_b32 s13, vcc_lo, exec_lo
613; GFX10-32-NEXT:    s_andn2_b32 s12, s12, s13
614; GFX10-32-NEXT:    s_cbranch_scc0 BB5_2
615; GFX10-32-NEXT:  ; %bb.1: ; %.entry
616; GFX10-32-NEXT:    s_wqm_b32 s13, s12
617; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s13
618; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
619; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
620; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
621; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
622; GFX10-32-NEXT:    s_branch BB5_3
623; GFX10-32-NEXT:  BB5_2:
624; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
625; GFX10-32-NEXT:    exp null off, off, off, off done vm
626; GFX10-32-NEXT:    s_endpgm
627; GFX10-32-NEXT:  BB5_3:
628;
629; GFX10-64-LABEL: wqm_demote_dynamic:
630; GFX10-64:       ; %bb.0: ; %.entry
631; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
632; GFX10-64-NEXT:    s_wqm_b64 exec, exec
633; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
634; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
635; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
636; GFX10-64-NEXT:    s_xor_b64 s[14:15], vcc, exec
637; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
638; GFX10-64-NEXT:    s_cbranch_scc0 BB5_2
639; GFX10-64-NEXT:  ; %bb.1: ; %.entry
640; GFX10-64-NEXT:    s_wqm_b64 s[14:15], s[12:13]
641; GFX10-64-NEXT:    s_and_b64 exec, exec, s[14:15]
642; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
643; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
644; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
645; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
646; GFX10-64-NEXT:    s_branch BB5_3
647; GFX10-64-NEXT:  BB5_2:
648; GFX10-64-NEXT:    s_mov_b64 exec, 0
649; GFX10-64-NEXT:    exp null off, off, off, off done vm
650; GFX10-64-NEXT:    s_endpgm
651; GFX10-64-NEXT:  BB5_3:
652.entry:
653  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
654  %tex0 = extractelement <4 x float> %tex, i32 0
655  %tex1 = extractelement <4 x float> %tex, i32 0
656  %z.cmp = fcmp olt float %tex0, 0.0
  ; The demote condition is the per-lane compare of the sampled value.
657  call void @llvm.amdgcn.wqm.demote(i1 %z.cmp)
658  %coord1 = fadd float %tex0, %tex1
659  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
660
661  ret <4 x float> %rtex
662}
663
664
665define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
666; SI-LABEL: wqm_deriv:
667; SI:       ; %bb.0: ; %.entry
668; SI-NEXT:    s_mov_b64 s[0:1], exec
669; SI-NEXT:    s_wqm_b64 exec, exec
670; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
671; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
672; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
673; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
674; SI-NEXT:    s_cbranch_execz BB6_3
675; SI-NEXT:  ; %bb.1: ; %.demote0
676; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
677; SI-NEXT:    s_cbranch_scc0 BB6_7
678; SI-NEXT:  ; %bb.2: ; %.demote0
679; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
680; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
681; SI-NEXT:  BB6_3: ; %.continue0
682; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
683; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
684; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
685; SI-NEXT:    v_mov_b32_e32 v1, v0
686; SI-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
687; SI-NEXT:    s_nop 0
688; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
689; SI-NEXT:    s_nop 1
690; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
691; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
692; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
693; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
694; SI-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
695; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
696; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
697; SI-NEXT:    s_cbranch_execz BB6_6
698; SI-NEXT:  ; %bb.4: ; %.demote1
699; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
700; SI-NEXT:    s_cbranch_scc0 BB6_7
701; SI-NEXT:  ; %bb.5: ; %.demote1
702; SI-NEXT:    s_mov_b64 exec, 0
703; SI-NEXT:  BB6_6: ; %.continue1
704; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
705; SI-NEXT:    v_bfrev_b32_e32 v0, 60
706; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
707; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
708; SI-NEXT:    s_endpgm
709; SI-NEXT:  BB6_7:
710; SI-NEXT:    s_mov_b64 exec, 0
711; SI-NEXT:    exp null off, off, off, off done vm
712; SI-NEXT:    s_endpgm
713;
714; GFX9-LABEL: wqm_deriv:
715; GFX9:       ; %bb.0: ; %.entry
716; GFX9-NEXT:    s_mov_b64 s[0:1], exec
717; GFX9-NEXT:    s_wqm_b64 exec, exec
718; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
719; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
720; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
721; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
722; GFX9-NEXT:    s_cbranch_execz BB6_3
723; GFX9-NEXT:  ; %bb.1: ; %.demote0
724; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
725; GFX9-NEXT:    s_cbranch_scc0 BB6_7
726; GFX9-NEXT:  ; %bb.2: ; %.demote0
727; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
728; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
729; GFX9-NEXT:  BB6_3: ; %.continue0
730; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
731; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
732; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
733; GFX9-NEXT:    v_mov_b32_e32 v1, v0
734; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
735; GFX9-NEXT:    s_nop 0
736; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
737; GFX9-NEXT:    s_nop 1
738; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
739; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
740; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
741; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
742; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
743; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
744; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
745; GFX9-NEXT:    s_cbranch_execz BB6_6
746; GFX9-NEXT:  ; %bb.4: ; %.demote1
747; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
748; GFX9-NEXT:    s_cbranch_scc0 BB6_7
749; GFX9-NEXT:  ; %bb.5: ; %.demote1
750; GFX9-NEXT:    s_mov_b64 exec, 0
751; GFX9-NEXT:  BB6_6: ; %.continue1
752; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
753; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
754; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
755; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
756; GFX9-NEXT:    s_endpgm
757; GFX9-NEXT:  BB6_7:
758; GFX9-NEXT:    s_mov_b64 exec, 0
759; GFX9-NEXT:    exp null off, off, off, off done vm
760; GFX9-NEXT:    s_endpgm
761;
762; GFX10-32-LABEL: wqm_deriv:
763; GFX10-32:       ; %bb.0: ; %.entry
764; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
765; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
766; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
767; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
768; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
769; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
770; GFX10-32-NEXT:    s_cbranch_execz BB6_3
771; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
772; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
773; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
774; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
775; GFX10-32-NEXT:    s_wqm_b32 s2, s0
776; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
777; GFX10-32-NEXT:  BB6_3: ; %.continue0
778; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
779; GFX10-32-NEXT:    s_mov_b32 s1, s0
780; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
781; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
782; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
783; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
784; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
785; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
786; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
787; GFX10-32-NEXT:    s_xor_b32 s1, s0, -1
788; GFX10-32-NEXT:    s_or_b32 s1, s1, vcc_lo
789; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
790; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
791; GFX10-32-NEXT:    s_cbranch_execz BB6_6
792; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
793; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
794; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
795; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
796; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
797; GFX10-32-NEXT:  BB6_6: ; %.continue1
798; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
799; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
800; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
801; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
802; GFX10-32-NEXT:    s_endpgm
803; GFX10-32-NEXT:  BB6_7:
804; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
805; GFX10-32-NEXT:    exp null off, off, off, off done vm
806; GFX10-32-NEXT:    s_endpgm
807;
808; GFX10-64-LABEL: wqm_deriv:
809; GFX10-64:       ; %bb.0: ; %.entry
810; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
811; GFX10-64-NEXT:    s_wqm_b64 exec, exec
812; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
813; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
814; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
815; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
816; GFX10-64-NEXT:    s_cbranch_execz BB6_3
817; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
818; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
819; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
820; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
821; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
822; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
823; GFX10-64-NEXT:  BB6_3: ; %.continue0
824; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
825; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
826; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
827; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
828; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
829; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
830; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
831; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
832; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
833; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
834; GFX10-64-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
835; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
836; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
837; GFX10-64-NEXT:    s_cbranch_execz BB6_6
838; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
839; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
840; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
841; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
842; GFX10-64-NEXT:    s_mov_b64 exec, 0
843; GFX10-64-NEXT:  BB6_6: ; %.continue1
844; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
845; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
846; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
847; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
848; GFX10-64-NEXT:    s_endpgm
849; GFX10-64-NEXT:  BB6_7:
850; GFX10-64-NEXT:    s_mov_b64 exec, 0
851; GFX10-64-NEXT:    exp null off, off, off, off done vm
852; GFX10-64-NEXT:    s_endpgm
853.entry:
854  %p0 = extractelement <2 x float> %input, i32 0
855  %p1 = extractelement <2 x float> %input, i32 1
856  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
857  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
858  %argi = fptosi float %arg to i32
859  %cond0 = icmp eq i32 %argi, 0
860  br i1 %cond0, label %.continue0, label %.demote0
861
862.demote0:
863  call void @llvm.amdgcn.wqm.demote(i1 false)
864  br label %.continue0
865
866.continue0:
867  %live = call i1 @llvm.amdgcn.live.mask()
868  %live.cond = select i1 %live, i32 0, i32 1065353216
869  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
870  %live.v0f = bitcast i32 %live.v0 to float
871  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
872  %live.v1f = bitcast i32 %live.v1 to float
873  %v0 = fsub float %live.v0f, %live.v1f
874  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
875  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
876  %cond2 = and i1 %live, %cond1
877  br i1 %cond2, label %.continue1, label %.demote1
878
879.demote1:
880  call void @llvm.amdgcn.wqm.demote(i1 false)
881  br label %.continue1
882
883.continue1:
884  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
885  ret void
886}
887
; wqm_deriv_loop: pixel-shader test in which llvm.amdgcn.wqm.demote(i1 false)
; is called once before a loop (block .demote0) and again conditionally inside
; the loop (block .demote1). Every iteration reads llvm.amdgcn.live.mask,
; subtracts two llvm.amdgcn.mov.dpp reads of a live-dependent value, and passes
; the difference through llvm.amdgcn.wqm.f32 before deciding whether to demote.
; The assertion lines that follow are autogenerated (see the NOTE on the first
; line of this file); regenerate them with that script instead of hand-editing.
888define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
889; SI-LABEL: wqm_deriv_loop:
890; SI:       ; %bb.0: ; %.entry
891; SI-NEXT:    s_mov_b64 s[0:1], exec
892; SI-NEXT:    s_wqm_b64 exec, exec
893; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
894; SI-NEXT:    s_mov_b32 s2, 0
895; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
896; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
897; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
898; SI-NEXT:    s_cbranch_execz BB7_3
899; SI-NEXT:  ; %bb.1: ; %.demote0
900; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
901; SI-NEXT:    s_cbranch_scc0 BB7_9
902; SI-NEXT:  ; %bb.2: ; %.demote0
903; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
904; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
905; SI-NEXT:  BB7_3: ; %.continue0.preheader
906; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
907; SI-NEXT:    s_mov_b64 s[4:5], 0
908; SI-NEXT:    s_branch BB7_5
909; SI-NEXT:  BB7_4: ; %.continue1
910; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
911; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
912; SI-NEXT:    s_add_i32 s2, s2, 1
913; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
914; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
915; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
916; SI-NEXT:    s_cbranch_execz BB7_8
917; SI-NEXT:  BB7_5: ; %.continue0
918; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
919; SI-NEXT:    v_mov_b32_e32 v0, s2
920; SI-NEXT:    s_mov_b64 s[6:7], s[0:1]
921; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[6:7]
922; SI-NEXT:    v_mov_b32_e32 v2, v0
923; SI-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
924; SI-NEXT:    s_nop 0
925; SI-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
926; SI-NEXT:    s_nop 1
927; SI-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
928; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
929; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
930; SI-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
931; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
932; SI-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
933; SI-NEXT:    s_cbranch_execz BB7_4
934; SI-NEXT:  ; %bb.6: ; %.demote1
935; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
936; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
937; SI-NEXT:    s_cbranch_scc0 BB7_9
938; SI-NEXT:  ; %bb.7: ; %.demote1
939; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
940; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
941; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
942; SI-NEXT:    s_branch BB7_4
943; SI-NEXT:  BB7_8: ; %.return
944; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
945; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
946; SI-NEXT:    v_bfrev_b32_e32 v0, 60
947; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
948; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
949; SI-NEXT:    s_endpgm
950; SI-NEXT:  BB7_9:
951; SI-NEXT:    s_mov_b64 exec, 0
952; SI-NEXT:    exp null off, off, off, off done vm
953; SI-NEXT:    s_endpgm
954;
955; GFX9-LABEL: wqm_deriv_loop:
956; GFX9:       ; %bb.0: ; %.entry
957; GFX9-NEXT:    s_mov_b64 s[0:1], exec
958; GFX9-NEXT:    s_wqm_b64 exec, exec
959; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
960; GFX9-NEXT:    s_mov_b32 s2, 0
961; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
962; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
963; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
964; GFX9-NEXT:    s_cbranch_execz BB7_3
965; GFX9-NEXT:  ; %bb.1: ; %.demote0
966; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
967; GFX9-NEXT:    s_cbranch_scc0 BB7_9
968; GFX9-NEXT:  ; %bb.2: ; %.demote0
969; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
970; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
971; GFX9-NEXT:  BB7_3: ; %.continue0.preheader
972; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
973; GFX9-NEXT:    s_mov_b64 s[4:5], 0
974; GFX9-NEXT:    s_branch BB7_5
975; GFX9-NEXT:  BB7_4: ; %.continue1
976; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
977; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
978; GFX9-NEXT:    s_add_i32 s2, s2, 1
979; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
980; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
981; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
982; GFX9-NEXT:    s_cbranch_execz BB7_8
983; GFX9-NEXT:  BB7_5: ; %.continue0
984; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
985; GFX9-NEXT:    v_mov_b32_e32 v0, s2
986; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
987; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[6:7]
988; GFX9-NEXT:    v_mov_b32_e32 v2, v0
989; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
990; GFX9-NEXT:    s_nop 0
991; GFX9-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
992; GFX9-NEXT:    s_nop 1
993; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
994; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
995; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
996; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
997; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
998; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
999; GFX9-NEXT:    s_cbranch_execz BB7_4
1000; GFX9-NEXT:  ; %bb.6: ; %.demote1
1001; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1002; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
1003; GFX9-NEXT:    s_cbranch_scc0 BB7_9
1004; GFX9-NEXT:  ; %bb.7: ; %.demote1
1005; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1006; GFX9-NEXT:    s_wqm_b64 s[8:9], s[0:1]
1007; GFX9-NEXT:    s_and_b64 exec, exec, s[8:9]
1008; GFX9-NEXT:    s_branch BB7_4
1009; GFX9-NEXT:  BB7_8: ; %.return
1010; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1011; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
1012; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
1013; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
1014; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
1015; GFX9-NEXT:    s_endpgm
1016; GFX9-NEXT:  BB7_9:
1017; GFX9-NEXT:    s_mov_b64 exec, 0
1018; GFX9-NEXT:    exp null off, off, off, off done vm
1019; GFX9-NEXT:    s_endpgm
1020;
1021; GFX10-32-LABEL: wqm_deriv_loop:
1022; GFX10-32:       ; %bb.0: ; %.entry
1023; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
1024; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1025; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
1026; GFX10-32-NEXT:    s_mov_b32 s1, 0
1027; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1028; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
1029; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
1030; GFX10-32-NEXT:    s_cbranch_execz BB7_3
1031; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
1032; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
1033; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
1034; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
1035; GFX10-32-NEXT:    s_wqm_b32 s3, s0
1036; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
1037; GFX10-32-NEXT:  BB7_3: ; %.continue0.preheader
1038; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1039; GFX10-32-NEXT:    s_mov_b32 s2, 0
1040; GFX10-32-NEXT:    s_branch BB7_5
1041; GFX10-32-NEXT:  BB7_4: ; %.continue1
1042; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1043; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1044; GFX10-32-NEXT:    s_add_i32 s2, s2, 1
1045; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, s2, v1
1046; GFX10-32-NEXT:    s_or_b32 s1, vcc_lo, s1
1047; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
1048; GFX10-32-NEXT:    s_cbranch_execz BB7_8
1049; GFX10-32-NEXT:  BB7_5: ; %.continue0
1050; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
1051; GFX10-32-NEXT:    s_mov_b32 s3, s0
1052; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s3
1053; GFX10-32-NEXT:    s_xor_b32 s3, s0, -1
1054; GFX10-32-NEXT:    v_mov_b32_e32 v2, v0
1055; GFX10-32-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
1056; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
1057; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
1058; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
1059; GFX10-32-NEXT:    s_or_b32 s3, s3, vcc_lo
1060; GFX10-32-NEXT:    s_and_saveexec_b32 s4, s3
1061; GFX10-32-NEXT:    s_xor_b32 s3, exec_lo, s4
1062; GFX10-32-NEXT:    s_cbranch_execz BB7_4
1063; GFX10-32-NEXT:  ; %bb.6: ; %.demote1
1064; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1065; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
1066; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
1067; GFX10-32-NEXT:  ; %bb.7: ; %.demote1
1068; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1069; GFX10-32-NEXT:    s_wqm_b32 s4, s0
1070; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s4
1071; GFX10-32-NEXT:    s_branch BB7_4
1072; GFX10-32-NEXT:  BB7_8: ; %.return
1073; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
1074; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1075; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
1076; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
1077; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
1078; GFX10-32-NEXT:    s_endpgm
1079; GFX10-32-NEXT:  BB7_9:
1080; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
1081; GFX10-32-NEXT:    exp null off, off, off, off done vm
1082; GFX10-32-NEXT:    s_endpgm
1083;
1084; GFX10-64-LABEL: wqm_deriv_loop:
1085; GFX10-64:       ; %bb.0: ; %.entry
1086; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
1087; GFX10-64-NEXT:    s_wqm_b64 exec, exec
1088; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
1089; GFX10-64-NEXT:    s_mov_b32 s2, 0
1090; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1091; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1092; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
1093; GFX10-64-NEXT:    s_cbranch_execz BB7_3
1094; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
1095; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
1096; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
1097; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
1098; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
1099; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
1100; GFX10-64-NEXT:  BB7_3: ; %.continue0.preheader
1101; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
1102; GFX10-64-NEXT:    s_mov_b64 s[4:5], 0
1103; GFX10-64-NEXT:    s_branch BB7_5
1104; GFX10-64-NEXT:  BB7_4: ; %.continue1
1105; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1106; GFX10-64-NEXT:    s_or_b64 exec, exec, s[6:7]
1107; GFX10-64-NEXT:    s_add_i32 s2, s2, 1
1108; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
1109; GFX10-64-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
1110; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[4:5]
1111; GFX10-64-NEXT:    s_cbranch_execz BB7_8
1112; GFX10-64-NEXT:  BB7_5: ; %.continue0
1113; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
1114; GFX10-64-NEXT:    s_mov_b64 s[6:7], s[0:1]
1115; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s[6:7]
1116; GFX10-64-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
1117; GFX10-64-NEXT:    v_mov_b32_e32 v2, v0
1118; GFX10-64-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
1119; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
1120; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
1121; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
1122; GFX10-64-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
1123; GFX10-64-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
1124; GFX10-64-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
1125; GFX10-64-NEXT:    s_cbranch_execz BB7_4
1126; GFX10-64-NEXT:  ; %bb.6: ; %.demote1
1127; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1128; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
1129; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
1130; GFX10-64-NEXT:  ; %bb.7: ; %.demote1
1131; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
1132; GFX10-64-NEXT:    s_wqm_b64 s[8:9], s[0:1]
1133; GFX10-64-NEXT:    s_and_b64 exec, exec, s[8:9]
1134; GFX10-64-NEXT:    s_branch BB7_4
1135; GFX10-64-NEXT:  BB7_8: ; %.return
1136; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
1137; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
1138; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
1139; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
1140; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
1141; GFX10-64-NEXT:    s_endpgm
1142; GFX10-64-NEXT:  BB7_9:
1143; GFX10-64-NEXT:    s_mov_b64 exec, 0
1144; GFX10-64-NEXT:    exp null off, off, off, off done vm
1145; GFX10-64-NEXT:    s_endpgm
1146.entry:
  ; NOTE(review): %x1 has no users in this function, so the %p0/%p1/%x0/%x1
  ; chain looks dead; the generated checks above contain no interp
  ; instruction. Confirm whether the interp calls are still wanted here.
1147  %p0 = extractelement <2 x float> %input, i32 0
1148  %p1 = extractelement <2 x float> %input, i32 1
1149  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
1150  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
  ; Take the .demote0 path only when %arg converts to a non-zero integer.
1151  %argi = fptosi float %arg to i32
1152  %cond0 = icmp eq i32 %argi, 0
1153  br i1 %cond0, label %.continue0, label %.demote0
1154
1155.demote0:
  ; First demote, executed before the loop is entered.
1156  call void @llvm.amdgcn.wqm.demote(i1 false)
1157  br label %.continue0
1158
1159.continue0:
  ; Loop header: %count is 0 when arriving from .entry or .demote0 and %next
  ; when arriving over the back edge from .continue1.
1160  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ]
1161  %live = call i1 @llvm.amdgcn.live.mask()
  ; 0 when the live mask reports true, otherwise the current iteration count.
1162  %live.cond = select i1 %live, i32 0, i32 %count
  ; Two DPP reads of the same value with control operands 85 and 0; the
  ; generated checks print them as quad_perm:[1,1,1,1] and quad_perm:[0,0,0,0].
1163  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
1164  %live.v0f = bitcast i32 %live.v0 to float
1165  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
1166  %live.v1f = bitcast i32 %live.v1 to float
1167  %v0 = fsub float %live.v0f, %live.v1f
1168  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
  ; Skip the second demote only if the lane is live and the difference is
  ; ordered-equal to zero; otherwise go to .demote1.
1169  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
1170  %cond2 = and i1 %live, %cond1
1171  br i1 %cond2, label %.continue1, label %.demote1
1172
1173.demote1:
  ; Second demote, executed inside the loop body.
1174  call void @llvm.amdgcn.wqm.demote(i1 false)
1175  br label %.continue1
1176
1177.continue1:
  ; Loop latch: signed compare, repeat while %next < %limit.
1178  %next = add i32 %count, 1
1179  %loop.cond = icmp slt i32 %next, %limit
1180  br i1 %loop.cond, label %.continue0, label %.return
1181
1182.return:
  ; Compressed export of constant half vectors; both trailing i1 flags are
  ; true, which the generated checks print as "done" and "vm".
1183  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
1184  ret void
1185}
1186
; static_exact_nop: llvm.amdgcn.wqm.demote is called with a constant true
; argument. The generated checks for every configuration contain only the
; compare, the select and the export, followed by s_endpgm; there is no
; exec-mask update and no separate early-exit block.
1187define amdgpu_ps void @static_exact_nop(float %arg0, float %arg1) {
1188; SI-LABEL: static_exact_nop:
1189; SI:       ; %bb.0: ; %.entry
1190; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
1191; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
1192; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
1193; SI-NEXT:    s_endpgm
1194;
1195; GFX9-LABEL: static_exact_nop:
1196; GFX9:       ; %bb.0: ; %.entry
1197; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
1198; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
1199; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
1200; GFX9-NEXT:    s_endpgm
1201;
1202; GFX10-32-LABEL: static_exact_nop:
1203; GFX10-32:       ; %bb.0: ; %.entry
1204; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
1205; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
1206; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
1207; GFX10-32-NEXT:    s_endpgm
1208;
1209; GFX10-64-LABEL: static_exact_nop:
1210; GFX10-64:       ; %bb.0: ; %.entry
1211; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
1212; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
1213; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
1214; GFX10-64-NEXT:    s_endpgm
1215.entry:
1216  %c0 = fcmp olt float %arg0, 0.000000e+00
  ; NOTE(review): %c1 has no users in this function (the demote below takes a
  ; constant), so %arg1 is effectively unused. Confirm whether this compare
  ; is kept on purpose before removing it.
1217  %c1 = fcmp oge float %arg1, 0.0
  ; Demote with a constant true condition.
1218  call void @llvm.amdgcn.wqm.demote(i1 true)
  ; Export 1.0 in all four channels when %arg0 < 0.0, otherwise 0.0.
1219  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
1220  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
1221  ret void
1222}
1223
1224
; Declarations of the AMDGPU intrinsics called by the test functions in this
; file. The trailing #N on each line selects one of the attribute groups
; defined at the end of the file.
; NOTE(review): llvm.amdgcn.image.sample.1d.v4f32.f32 is not called in the
; part of the file reviewed here; presumably an earlier function uses it.
1225declare void @llvm.amdgcn.wqm.demote(i1) #0
1226declare i1 @llvm.amdgcn.live.mask() #0
1227declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
1228declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1229declare float @llvm.amdgcn.wqm.f32(float) #1
1230declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
1231declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
1232declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
1233declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4
1234
; Attribute groups referenced as #0 through #4 by the intrinsic declarations
; and by several call sites in the test functions.
1235attributes #0 = { nounwind }
1236attributes #1 = { nounwind readnone }
1237attributes #2 = { nounwind readnone speculatable }
1238attributes #3 = { inaccessiblememonly nounwind }
1239attributes #4 = { convergent nounwind readnone }
1240