1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
3; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
4; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5
6
; test_div_scale_f32_1: llvm.amdgcn.div.scale.f32 with both value operands
; loaded (volatile) from %in[tid] and %in[tid + 1], and the i1 operand set to
; false. Only field 0 (the float) of the returned pair is stored to %out; the
; i1 field is unused.
; In every check set below the expected v_div_scale_f32 sources are
; (%b, %b, %a), i.e. the second load appears twice, followed by the first load.
7define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) {
8; GFX7-LABEL: test_div_scale_f32_1:
9; GFX7:       ; %bb.0:
10; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
11; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
12; GFX7-NEXT:    v_mov_b32_e32 v1, 0
13; GFX7-NEXT:    s_mov_b32 s6, 0
14; GFX7-NEXT:    s_mov_b32 s7, 0xf000
15; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
16; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
17; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
18; GFX7-NEXT:    s_waitcnt vmcnt(0)
19; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
20; GFX7-NEXT:    s_waitcnt vmcnt(0)
21; GFX7-NEXT:    s_mov_b32 s6, -1
22; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, v2
23; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
24; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
25; GFX7-NEXT:    s_endpgm
26;
27; GFX8-LABEL: test_div_scale_f32_1:
28; GFX8:       ; %bb.0:
29; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
30; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
31; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
32; GFX8-NEXT:    v_mov_b32_e32 v0, s2
33; GFX8-NEXT:    v_mov_b32_e32 v1, s3
34; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
35; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
36; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
37; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
38; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
39; GFX8-NEXT:    s_waitcnt vmcnt(0)
40; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
41; GFX8-NEXT:    s_waitcnt vmcnt(0)
42; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
43; GFX8-NEXT:    v_mov_b32_e32 v0, s0
44; GFX8-NEXT:    v_mov_b32_e32 v1, s1
45; GFX8-NEXT:    flat_store_dword v[0:1], v2
46; GFX8-NEXT:    s_endpgm
47;
48; GFX10-LABEL: test_div_scale_f32_1:
49; GFX10:       ; %bb.0:
50; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
51; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
52; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
54; GFX10-NEXT:    s_waitcnt vmcnt(0)
55; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
56; GFX10-NEXT:    s_waitcnt vmcnt(0)
57; GFX10-NEXT:    v_div_scale_f32 v0, s2, v2, v2, v1
58; GFX10-NEXT:    v_mov_b32_e32 v1, 0
59; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
60; GFX10-NEXT:    s_endpgm
61  %tid = call i32 @llvm.amdgcn.workitem.id.x()
62  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
63  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
64
65  %a = load volatile float, float addrspace(1)* %gep.0, align 4
66  %b = load volatile float, float addrspace(1)* %gep.1, align 4
67
  ; i1 operand is false in this variant; the sibling _2 test passes true.
68  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
  ; Keep only the scaled float; the i1 result is intentionally dropped.
69  %result0 = extractvalue { float, i1 } %result, 0
70  store float %result0, float addrspace(1)* %out, align 4
71  ret void
72}
73
; test_div_scale_f32_2: same shape as test_div_scale_f32_1 (both operands are
; volatile loads from %in[tid] and %in[tid + 1]) but with the i1 operand set
; to true. Only field 0 of the returned pair is stored to %out.
; In every check set below the expected v_div_scale_f32 sources are
; (%a, %b, %a), i.e. the first load, the second load, then the first again.
74define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) {
75; GFX7-LABEL: test_div_scale_f32_2:
76; GFX7:       ; %bb.0:
77; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
78; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
79; GFX7-NEXT:    v_mov_b32_e32 v1, 0
80; GFX7-NEXT:    s_mov_b32 s6, 0
81; GFX7-NEXT:    s_mov_b32 s7, 0xf000
82; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
84; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
85; GFX7-NEXT:    s_waitcnt vmcnt(0)
86; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
87; GFX7-NEXT:    s_waitcnt vmcnt(0)
88; GFX7-NEXT:    s_mov_b32 s6, -1
89; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v2, v0, v2
90; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
91; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
92; GFX7-NEXT:    s_endpgm
93;
94; GFX8-LABEL: test_div_scale_f32_2:
95; GFX8:       ; %bb.0:
96; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
97; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
98; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
99; GFX8-NEXT:    v_mov_b32_e32 v0, s2
100; GFX8-NEXT:    v_mov_b32_e32 v1, s3
101; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
102; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
103; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
104; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
105; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
106; GFX8-NEXT:    s_waitcnt vmcnt(0)
107; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
108; GFX8-NEXT:    s_waitcnt vmcnt(0)
109; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v1, v0
110; GFX8-NEXT:    v_mov_b32_e32 v0, s0
111; GFX8-NEXT:    v_mov_b32_e32 v1, s1
112; GFX8-NEXT:    flat_store_dword v[0:1], v2
113; GFX8-NEXT:    s_endpgm
114;
115; GFX10-LABEL: test_div_scale_f32_2:
116; GFX10:       ; %bb.0:
117; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
118; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
119; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
121; GFX10-NEXT:    s_waitcnt vmcnt(0)
122; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
123; GFX10-NEXT:    s_waitcnt vmcnt(0)
124; GFX10-NEXT:    v_div_scale_f32 v0, s2, v1, v2, v1
125; GFX10-NEXT:    v_mov_b32_e32 v1, 0
126; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
127; GFX10-NEXT:    s_endpgm
128  %tid = call i32 @llvm.amdgcn.workitem.id.x()
129  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
130  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
131
132  %a = load volatile float, float addrspace(1)* %gep.0, align 4
133  %b = load volatile float, float addrspace(1)* %gep.1, align 4
134
  ; i1 operand is true in this variant; the sibling _1 test passes false.
135  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
  ; Keep only the scaled float; the i1 result is intentionally dropped.
136  %result0 = extractvalue { float, i1 } %result, 0
137  store float %result0, float addrspace(1)* %out, align 4
138  ret void
139}
140
; test_div_scale_f64_1: llvm.amdgcn.div.scale.f64 with both value operands
; loaded (volatile) from %in[tid] and %in[tid + 1], and the i1 operand set to
; false. Only field 0 (the double) of the returned pair is stored to %out.
; The %aptr kernel argument is never referenced in the body; it only occupies
; a slot in the argument list ahead of %in.
; In every check set below the expected v_div_scale_f64 sources are
; (%b, %b, %a).
141define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) {
142; GFX7-LABEL: test_div_scale_f64_1:
143; GFX7:       ; %bb.0:
144; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
145; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
146; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
147; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
148; GFX7-NEXT:    v_mov_b32_e32 v0, s2
149; GFX7-NEXT:    v_mov_b32_e32 v1, s3
150; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
151; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
152; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
153; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
154; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
155; GFX7-NEXT:    s_waitcnt vmcnt(0)
156; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
157; GFX7-NEXT:    s_waitcnt vmcnt(0)
158; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
159; GFX7-NEXT:    v_mov_b32_e32 v3, s1
160; GFX7-NEXT:    v_mov_b32_e32 v2, s0
161; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
162; GFX7-NEXT:    s_endpgm
163;
164; GFX8-LABEL: test_div_scale_f64_1:
165; GFX8:       ; %bb.0:
166; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
167; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
168; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
169; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX8-NEXT:    v_mov_b32_e32 v0, s2
171; GFX8-NEXT:    v_mov_b32_e32 v1, s3
172; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
173; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
174; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
175; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
176; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
177; GFX8-NEXT:    s_waitcnt vmcnt(0)
178; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
179; GFX8-NEXT:    s_waitcnt vmcnt(0)
180; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
181; GFX8-NEXT:    v_mov_b32_e32 v3, s1
182; GFX8-NEXT:    v_mov_b32_e32 v2, s0
183; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
184; GFX8-NEXT:    s_endpgm
185;
186; GFX10-LABEL: test_div_scale_f64_1:
187; GFX10:       ; %bb.0:
188; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
189; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
190; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
191; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
192; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
193; GFX10-NEXT:    s_waitcnt vmcnt(0)
194; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
195; GFX10-NEXT:    s_waitcnt vmcnt(0)
196; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
197; GFX10-NEXT:    v_mov_b32_e32 v2, 0
198; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
199; GFX10-NEXT:    s_endpgm
200  %tid = call i32 @llvm.amdgcn.workitem.id.x()
201  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
202  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
203
204  %a = load volatile double, double addrspace(1)* %gep.0, align 8
205  %b = load volatile double, double addrspace(1)* %gep.1, align 8
206
  ; i1 operand is false in this variant; the sibling _2 test passes true.
207  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
  ; Keep only the scaled double; the i1 result is intentionally dropped.
208  %result0 = extractvalue { double, i1 } %result, 0
209  store double %result0, double addrspace(1)* %out, align 8
210  ret void
211}
212
; test_div_scale_f64_2: same shape as test_div_scale_f64_1 (both operands are
; volatile loads from %in[tid] and %in[tid + 1]; %aptr is unused) but with the
; i1 operand set to true. Only field 0 of the returned pair is stored to %out.
; In every check set below the expected v_div_scale_f64 sources are
; (%a, %b, %a).
213define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) {
214; GFX7-LABEL: test_div_scale_f64_2:
215; GFX7:       ; %bb.0:
216; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
217; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
218; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
219; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
220; GFX7-NEXT:    v_mov_b32_e32 v0, s2
221; GFX7-NEXT:    v_mov_b32_e32 v1, s3
222; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
223; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
224; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
225; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
226; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
227; GFX7-NEXT:    s_waitcnt vmcnt(0)
228; GFX7-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
229; GFX7-NEXT:    s_waitcnt vmcnt(0)
230; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
231; GFX7-NEXT:    v_mov_b32_e32 v3, s1
232; GFX7-NEXT:    v_mov_b32_e32 v2, s0
233; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
234; GFX7-NEXT:    s_endpgm
235;
236; GFX8-LABEL: test_div_scale_f64_2:
237; GFX8:       ; %bb.0:
238; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
239; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
240; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
241; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
242; GFX8-NEXT:    v_mov_b32_e32 v0, s2
243; GFX8-NEXT:    v_mov_b32_e32 v1, s3
244; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
245; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
246; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
247; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
248; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
249; GFX8-NEXT:    s_waitcnt vmcnt(0)
250; GFX8-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] glc
251; GFX8-NEXT:    s_waitcnt vmcnt(0)
252; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
253; GFX8-NEXT:    v_mov_b32_e32 v3, s1
254; GFX8-NEXT:    v_mov_b32_e32 v2, s0
255; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
256; GFX8-NEXT:    s_endpgm
257;
258; GFX10-LABEL: test_div_scale_f64_2:
259; GFX10:       ; %bb.0:
260; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
261; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
262; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
263; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
265; GFX10-NEXT:    s_waitcnt vmcnt(0)
266; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
267; GFX10-NEXT:    s_waitcnt vmcnt(0)
268; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
269; GFX10-NEXT:    v_mov_b32_e32 v2, 0
270; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
271; GFX10-NEXT:    s_endpgm
272  %tid = call i32 @llvm.amdgcn.workitem.id.x()
273  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
274  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
275
276  %a = load volatile double, double addrspace(1)* %gep.0, align 8
277  %b = load volatile double, double addrspace(1)* %gep.1, align 8
278
  ; i1 operand is true in this variant; the sibling _1 test passes false.
279  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
  ; Keep only the scaled double; the i1 result is intentionally dropped.
280  %result0 = extractvalue { double, i1 } %result, 0
281  store double %result0, double addrspace(1)* %out, align 8
282  ret void
283}
284
; test_div_scale_f32_scalar_num_1: llvm.amdgcn.div.scale.f32 where the first
; intrinsic operand (%a) is a kernel argument and the second (%b) is a plain
; (non-volatile) load from %in[tid]; the i1 operand is false.
; The [8 x i32] argument sits between %in and %a in the signature; the checks
; load %a from kernarg offset 0x15 (GFX7) or 0x54 (GFX8 and GFX10).
; Expected v_div_scale_f32 sources in every check set: (%b, %b, %a), with %a
; supplied directly from an SGPR.
285define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, [8 x i32], float %a) {
286; GFX7-LABEL: test_div_scale_f32_scalar_num_1:
287; GFX7:       ; %bb.0:
288; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
289; GFX7-NEXT:    s_load_dword s8, s[0:1], 0x15
290; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
291; GFX7-NEXT:    v_mov_b32_e32 v1, 0
292; GFX7-NEXT:    s_mov_b32 s2, 0
293; GFX7-NEXT:    s_mov_b32 s3, 0xf000
294; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
296; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
297; GFX7-NEXT:    s_mov_b32 s2, -1
298; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
299; GFX7-NEXT:    s_waitcnt vmcnt(0)
300; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, v0, s8
301; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
302; GFX7-NEXT:    s_endpgm
303;
304; GFX8-LABEL: test_div_scale_f32_scalar_num_1:
305; GFX8:       ; %bb.0:
306; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
307; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x54
308; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
309; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
310; GFX8-NEXT:    v_mov_b32_e32 v0, s6
311; GFX8-NEXT:    v_mov_b32_e32 v1, s7
312; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
313; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
314; GFX8-NEXT:    flat_load_dword v0, v[0:1]
315; GFX8-NEXT:    s_waitcnt vmcnt(0)
316; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], v0, v0, s0
317; GFX8-NEXT:    v_mov_b32_e32 v0, s4
318; GFX8-NEXT:    v_mov_b32_e32 v1, s5
319; GFX8-NEXT:    flat_store_dword v[0:1], v2
320; GFX8-NEXT:    s_endpgm
321;
322; GFX10-LABEL: test_div_scale_f32_scalar_num_1:
323; GFX10:       ; %bb.0:
324; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
325; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
326; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x54
327; GFX10-NEXT:    v_mov_b32_e32 v1, 0
328; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
329; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
330; GFX10-NEXT:    s_waitcnt vmcnt(0)
331; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, v0, s0
332; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
333; GFX10-NEXT:    s_endpgm
334  %tid = call i32 @llvm.amdgcn.workitem.id.x()
335  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
336
337  %b = load float, float addrspace(1)* %gep, align 4
338
  ; %a comes from the kernel argument list, %b from memory; i1 operand is false.
339  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
340  %result0 = extractvalue { float, i1 } %result, 0
341  store float %result0, float addrspace(1)* %out, align 4
342  ret void
343}
344
; test_div_scale_f32_scalar_num_2: llvm.amdgcn.div.scale.f32 where the first
; intrinsic operand (%a) is a kernel argument and the second (%b) is a plain
; load from %in[tid]; the i1 operand is true.
; Unlike the _1 variant, this signature has no [8 x i32] argument before %a;
; the checks load %a from kernarg offset 0xd (GFX7) or 0x34 (GFX8 and GFX10).
; Expected v_div_scale_f32 sources in every check set: (%a, %b, %a), with %a
; supplied directly from an SGPR in both positions.
345define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) {
346; GFX7-LABEL: test_div_scale_f32_scalar_num_2:
347; GFX7:       ; %bb.0:
348; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
349; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
350; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
351; GFX7-NEXT:    v_mov_b32_e32 v1, 0
352; GFX7-NEXT:    s_mov_b32 s2, 0
353; GFX7-NEXT:    s_mov_b32 s3, 0xf000
354; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
355; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
356; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
357; GFX7-NEXT:    s_mov_b32 s2, -1
358; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
359; GFX7-NEXT:    s_waitcnt vmcnt(0)
360; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s8, v0, s8
361; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
362; GFX7-NEXT:    s_endpgm
363;
364; GFX8-LABEL: test_div_scale_f32_scalar_num_2:
365; GFX8:       ; %bb.0:
366; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
367; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
368; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
369; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX8-NEXT:    v_mov_b32_e32 v0, s6
371; GFX8-NEXT:    v_mov_b32_e32 v1, s7
372; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
373; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
374; GFX8-NEXT:    flat_load_dword v0, v[0:1]
375; GFX8-NEXT:    s_waitcnt vmcnt(0)
376; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], s0, v0, s0
377; GFX8-NEXT:    v_mov_b32_e32 v0, s4
378; GFX8-NEXT:    v_mov_b32_e32 v1, s5
379; GFX8-NEXT:    flat_store_dword v[0:1], v2
380; GFX8-NEXT:    s_endpgm
381;
382; GFX10-LABEL: test_div_scale_f32_scalar_num_2:
383; GFX10:       ; %bb.0:
384; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
385; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
386; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
387; GFX10-NEXT:    v_mov_b32_e32 v1, 0
388; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
389; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
390; GFX10-NEXT:    s_waitcnt vmcnt(0)
391; GFX10-NEXT:    v_div_scale_f32 v0, s0, s0, v0, s0
392; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
393; GFX10-NEXT:    s_endpgm
394  %tid = call i32 @llvm.amdgcn.workitem.id.x()
395  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
396
397  %b = load float, float addrspace(1)* %gep, align 4
398
  ; %a comes from the kernel argument list, %b from memory; i1 operand is true.
399  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
400  %result0 = extractvalue { float, i1 } %result, 0
401  store float %result0, float addrspace(1)* %out, align 4
402  ret void
403}
404
; test_div_scale_f32_scalar_den_1: llvm.amdgcn.div.scale.f32 where the second
; intrinsic operand (%b) is a kernel argument and the first (%a) is a plain
; load from %in[tid]; the i1 operand is false.
; The checks load %b from kernarg offset 0xd (GFX7) or 0x34 (GFX8 and GFX10).
; Expected v_div_scale_f32 sources in every check set: (%b, %b, %a), with %b
; supplied directly from an SGPR in both positions.
405define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) {
406; GFX7-LABEL: test_div_scale_f32_scalar_den_1:
407; GFX7:       ; %bb.0:
408; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
409; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
410; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
411; GFX7-NEXT:    v_mov_b32_e32 v1, 0
412; GFX7-NEXT:    s_mov_b32 s2, 0
413; GFX7-NEXT:    s_mov_b32 s3, 0xf000
414; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
415; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
416; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
417; GFX7-NEXT:    s_mov_b32 s2, -1
418; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
419; GFX7-NEXT:    s_waitcnt vmcnt(0)
420; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s8, s8, v0
421; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
422; GFX7-NEXT:    s_endpgm
423;
424; GFX8-LABEL: test_div_scale_f32_scalar_den_1:
425; GFX8:       ; %bb.0:
426; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
427; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
428; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
429; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
430; GFX8-NEXT:    v_mov_b32_e32 v0, s6
431; GFX8-NEXT:    v_mov_b32_e32 v1, s7
432; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
433; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
434; GFX8-NEXT:    flat_load_dword v0, v[0:1]
435; GFX8-NEXT:    s_waitcnt vmcnt(0)
436; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], s0, s0, v0
437; GFX8-NEXT:    v_mov_b32_e32 v0, s4
438; GFX8-NEXT:    v_mov_b32_e32 v1, s5
439; GFX8-NEXT:    flat_store_dword v[0:1], v2
440; GFX8-NEXT:    s_endpgm
441;
442; GFX10-LABEL: test_div_scale_f32_scalar_den_1:
443; GFX10:       ; %bb.0:
444; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
445; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
446; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
447; GFX10-NEXT:    v_mov_b32_e32 v1, 0
448; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
449; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
450; GFX10-NEXT:    s_waitcnt vmcnt(0)
451; GFX10-NEXT:    v_div_scale_f32 v0, s0, s0, s0, v0
452; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
453; GFX10-NEXT:    s_endpgm
454  %tid = call i32 @llvm.amdgcn.workitem.id.x()
455  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
456
457  %a = load float, float addrspace(1)* %gep, align 4
458
  ; %a comes from memory, %b from the kernel argument list; i1 operand is false.
459  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
460  %result0 = extractvalue { float, i1 } %result, 0
461  store float %result0, float addrspace(1)* %out, align 4
462  ret void
463}
464
; test_div_scale_f32_scalar_den_2: llvm.amdgcn.div.scale.f32 where the second
; intrinsic operand (%b) is a kernel argument and the first (%a) is a plain
; load from %in[tid]; the i1 operand is true.
; The checks load %b from kernarg offset 0xd (GFX7) or 0x34 (GFX8 and GFX10).
; Expected v_div_scale_f32 sources in every check set: (%a, %b, %a), with %b
; supplied directly from an SGPR.
465define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) {
466; GFX7-LABEL: test_div_scale_f32_scalar_den_2:
467; GFX7:       ; %bb.0:
468; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
469; GFX7-NEXT:    s_load_dword s8, s[0:1], 0xd
470; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
471; GFX7-NEXT:    v_mov_b32_e32 v1, 0
472; GFX7-NEXT:    s_mov_b32 s2, 0
473; GFX7-NEXT:    s_mov_b32 s3, 0xf000
474; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
475; GFX7-NEXT:    s_mov_b64 s[0:1], s[6:7]
476; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
477; GFX7-NEXT:    s_mov_b32 s2, -1
478; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
479; GFX7-NEXT:    s_waitcnt vmcnt(0)
480; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, s8, v0
481; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
482; GFX7-NEXT:    s_endpgm
483;
484; GFX8-LABEL: test_div_scale_f32_scalar_den_2:
485; GFX8:       ; %bb.0:
486; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
487; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
488; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
489; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX8-NEXT:    v_mov_b32_e32 v0, s6
491; GFX8-NEXT:    v_mov_b32_e32 v1, s7
492; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
493; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
494; GFX8-NEXT:    flat_load_dword v0, v[0:1]
495; GFX8-NEXT:    s_waitcnt vmcnt(0)
496; GFX8-NEXT:    v_div_scale_f32 v2, s[0:1], v0, s0, v0
497; GFX8-NEXT:    v_mov_b32_e32 v0, s4
498; GFX8-NEXT:    v_mov_b32_e32 v1, s5
499; GFX8-NEXT:    flat_store_dword v[0:1], v2
500; GFX8-NEXT:    s_endpgm
501;
502; GFX10-LABEL: test_div_scale_f32_scalar_den_2:
503; GFX10:       ; %bb.0:
504; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
505; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
506; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
507; GFX10-NEXT:    v_mov_b32_e32 v1, 0
508; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
509; GFX10-NEXT:    global_load_dword v0, v0, s[6:7]
510; GFX10-NEXT:    s_waitcnt vmcnt(0)
511; GFX10-NEXT:    v_div_scale_f32 v0, s0, v0, s0, v0
512; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
513; GFX10-NEXT:    s_endpgm
514  %tid = call i32 @llvm.amdgcn.workitem.id.x()
515  %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
516
517  %a = load float, float addrspace(1)* %gep, align 4
518
  ; %a comes from memory, %b from the kernel argument list; i1 operand is true.
519  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
520  %result0 = extractvalue { float, i1 } %result, 0
521  store float %result0, float addrspace(1)* %out, align 4
522  ret void
523}
524
; test_div_scale_f64_scalar_num_1: llvm.amdgcn.div.scale.f64 where the first
; intrinsic operand (%a) is a kernel argument and the second (%b) is a plain
; load from %in[tid]; the i1 operand is false.
; The [8 x i32] argument sits between %in and %a in the signature; the checks
; load %a from kernarg offset 0x15 (GFX7) or 0x54 (GFX8 and GFX10).
; Expected v_div_scale_f64 sources in every check set: (%b, %b, %a), with %a
; supplied directly from an SGPR pair.
525define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %a) {
526; GFX7-LABEL: test_div_scale_f64_scalar_num_1:
527; GFX7:       ; %bb.0:
528; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
529; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
530; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
531; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
532; GFX7-NEXT:    v_mov_b32_e32 v0, s6
533; GFX7-NEXT:    v_mov_b32_e32 v1, s7
534; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
535; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
536; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
537; GFX7-NEXT:    v_mov_b32_e32 v2, s4
538; GFX7-NEXT:    v_mov_b32_e32 v3, s5
539; GFX7-NEXT:    s_waitcnt vmcnt(0)
540; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
541; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
542; GFX7-NEXT:    s_endpgm
543;
544; GFX8-LABEL: test_div_scale_f64_scalar_num_1:
545; GFX8:       ; %bb.0:
546; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
547; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
548; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
549; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
550; GFX8-NEXT:    v_mov_b32_e32 v0, s6
551; GFX8-NEXT:    v_mov_b32_e32 v1, s7
552; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
553; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
554; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
555; GFX8-NEXT:    v_mov_b32_e32 v2, s4
556; GFX8-NEXT:    v_mov_b32_e32 v3, s5
557; GFX8-NEXT:    s_waitcnt vmcnt(0)
558; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
559; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
560; GFX8-NEXT:    s_endpgm
561;
562; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
563; GFX10:       ; %bb.0:
564; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
565; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
566; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
567; GFX10-NEXT:    v_mov_b32_e32 v2, 0
568; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
569; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
570; GFX10-NEXT:    s_waitcnt vmcnt(0)
571; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
572; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
573; GFX10-NEXT:    s_endpgm
574  %tid = call i32 @llvm.amdgcn.workitem.id.x()
575  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
576
577  %b = load double, double addrspace(1)* %gep, align 8
578
  ; %a comes from the kernel argument list, %b from memory; i1 operand is false.
579  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
580  %result0 = extractvalue { double, i1 } %result, 0
581  store double %result0, double addrspace(1)* %out, align 8
582  ret void
583}
584
; test_div_scale_f64_scalar_num_2: llvm.amdgcn.div.scale.f64 where the first
; intrinsic operand (%a) is a kernel argument and the second (%b) is a plain
; load from %in[tid]; the i1 operand is true.
; The [8 x i32] argument sits between %in and %a in the signature; the checks
; load %a from kernarg offset 0x15 (GFX7) or 0x54 (GFX8 and GFX10).
; Expected v_div_scale_f64 sources in every check set: (%a, %b, %a), with %a
; supplied directly from an SGPR pair in both positions.
585define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32],  double %a) {
586; GFX7-LABEL: test_div_scale_f64_scalar_num_2:
587; GFX7:       ; %bb.0:
588; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
589; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
590; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
591; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
592; GFX7-NEXT:    v_mov_b32_e32 v0, s6
593; GFX7-NEXT:    v_mov_b32_e32 v1, s7
594; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
595; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
596; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
597; GFX7-NEXT:    v_mov_b32_e32 v2, s4
598; GFX7-NEXT:    v_mov_b32_e32 v3, s5
599; GFX7-NEXT:    s_waitcnt vmcnt(0)
600; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
601; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
602; GFX7-NEXT:    s_endpgm
603;
604; GFX8-LABEL: test_div_scale_f64_scalar_num_2:
605; GFX8:       ; %bb.0:
606; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
607; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
608; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
609; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
610; GFX8-NEXT:    v_mov_b32_e32 v0, s6
611; GFX8-NEXT:    v_mov_b32_e32 v1, s7
612; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
613; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
614; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
615; GFX8-NEXT:    v_mov_b32_e32 v2, s4
616; GFX8-NEXT:    v_mov_b32_e32 v3, s5
617; GFX8-NEXT:    s_waitcnt vmcnt(0)
618; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
619; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
620; GFX8-NEXT:    s_endpgm
621;
622; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
623; GFX10:       ; %bb.0:
624; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
625; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
626; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
627; GFX10-NEXT:    v_mov_b32_e32 v2, 0
628; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
629; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
630; GFX10-NEXT:    s_waitcnt vmcnt(0)
631; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
632; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
633; GFX10-NEXT:    s_endpgm
634  %tid = call i32 @llvm.amdgcn.workitem.id.x()
635  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
636
637  %b = load double, double addrspace(1)* %gep, align 8
638
  ; %a comes from the kernel argument list, %b from memory; i1 operand is true.
639  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
640  %result0 = extractvalue { double, i1 } %result, 0
641  store double %result0, double addrspace(1)* %out, align 8
642  ret void
643}
644
; test_div_scale_f64_scalar_den_1: llvm.amdgcn.div.scale.f64 where the second
; intrinsic operand (%b) is a kernel argument and the first (%a) is a plain
; load from %in[tid]; the i1 operand is false.
; The [8 x i32] argument sits between %in and %b in the signature; the checks
; load %b from kernarg offset 0x15 (GFX7) or 0x54 (GFX8 and GFX10).
; Expected v_div_scale_f64 sources in every check set: (%b, %b, %a), with %b
; supplied directly from an SGPR pair in both positions.
645define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %b) {
646; GFX7-LABEL: test_div_scale_f64_scalar_den_1:
647; GFX7:       ; %bb.0:
648; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
649; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
650; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
651; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
652; GFX7-NEXT:    v_mov_b32_e32 v0, s6
653; GFX7-NEXT:    v_mov_b32_e32 v1, s7
654; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
655; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
656; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
657; GFX7-NEXT:    v_mov_b32_e32 v2, s4
658; GFX7-NEXT:    v_mov_b32_e32 v3, s5
659; GFX7-NEXT:    s_waitcnt vmcnt(0)
660; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
661; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
662; GFX7-NEXT:    s_endpgm
663;
664; GFX8-LABEL: test_div_scale_f64_scalar_den_1:
665; GFX8:       ; %bb.0:
666; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
667; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
668; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
669; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
670; GFX8-NEXT:    v_mov_b32_e32 v0, s6
671; GFX8-NEXT:    v_mov_b32_e32 v1, s7
672; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
673; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
674; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
675; GFX8-NEXT:    v_mov_b32_e32 v2, s4
676; GFX8-NEXT:    v_mov_b32_e32 v3, s5
677; GFX8-NEXT:    s_waitcnt vmcnt(0)
678; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
679; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
680; GFX8-NEXT:    s_endpgm
681;
682; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
683; GFX10:       ; %bb.0:
684; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
685; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
686; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
687; GFX10-NEXT:    v_mov_b32_e32 v2, 0
688; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
689; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
690; GFX10-NEXT:    s_waitcnt vmcnt(0)
691; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
692; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
693; GFX10-NEXT:    s_endpgm
694  %tid = call i32 @llvm.amdgcn.workitem.id.x()
695  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
696
697  %a = load double, double addrspace(1)* %gep, align 8
698
  ; %a comes from memory, %b from the kernel argument list; i1 operand is false.
699  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
700  %result0 = extractvalue { double, i1 } %result, 0
701  store double %result0, double addrspace(1)* %out, align 8
702  ret void
703}
704
; f64 div.scale where the first intrinsic operand (%a) is loaded per work-item
; from %in and the second operand (%b, the "scalar den" of the test name) is a
; kernel argument placed after an [8 x i32] padding argument. The i1 operand is
; true. In the expected output for all three targets the loaded value v[0:1] is
; the first and third source of v_div_scale_f64 and the kernel argument s[0:1]
; is the second source.
705define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, [8 x i32], double %b) {
706; GFX7-LABEL: test_div_scale_f64_scalar_den_2:
707; GFX7:       ; %bb.0:
708; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
709; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x15
710; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
711; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
712; GFX7-NEXT:    v_mov_b32_e32 v0, s6
713; GFX7-NEXT:    v_mov_b32_e32 v1, s7
714; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
715; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
716; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
717; GFX7-NEXT:    v_mov_b32_e32 v2, s4
718; GFX7-NEXT:    v_mov_b32_e32 v3, s5
719; GFX7-NEXT:    s_waitcnt vmcnt(0)
720; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
721; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
722; GFX7-NEXT:    s_endpgm
723;
724; GFX8-LABEL: test_div_scale_f64_scalar_den_2:
725; GFX8:       ; %bb.0:
726; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
727; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
728; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
729; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
730; GFX8-NEXT:    v_mov_b32_e32 v0, s6
731; GFX8-NEXT:    v_mov_b32_e32 v1, s7
732; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
733; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
734; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
735; GFX8-NEXT:    v_mov_b32_e32 v2, s4
736; GFX8-NEXT:    v_mov_b32_e32 v3, s5
737; GFX8-NEXT:    s_waitcnt vmcnt(0)
738; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
739; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
740; GFX8-NEXT:    s_endpgm
741;
742; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
743; GFX10:       ; %bb.0:
744; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
745; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
746; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x54
747; GFX10-NEXT:    v_mov_b32_e32 v2, 0
748; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
749; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[6:7]
750; GFX10-NEXT:    s_waitcnt vmcnt(0)
751; GFX10-NEXT:    v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
752; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
753; GFX10-NEXT:    s_endpgm
  ; Address of the double at index %tid (the work-item id) within %in.
754  %tid = call i32 @llvm.amdgcn.workitem.id.x()
755  %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
756
757  %a = load double, double addrspace(1)* %gep, align 8
758
  ; Only element 0 (the double) of the result is stored; the i1 element is unused.
759  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
760  %result0 = extractvalue { double, i1 } %result, 0
761  store double %result0, double addrspace(1)* %out, align 8
762  ret void
763}
764
; f32 div.scale with both value operands (%a, %b) passed as kernel arguments,
; each preceded by an [8 x i32] padding argument; the i1 operand is false.
; The GFX7/GFX8 checks expect %b to be copied into v0 and used as the first two
; sources with %a (s2) as the third. The GFX10 checks expect the SGPRs to be
; used directly (s5, s5, s4) with no VGPR copy.
765define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
766; GFX7-LABEL: test_div_scale_f32_all_scalar_1:
767; GFX7:       ; %bb.0:
768; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
769; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
770; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x1c
771; GFX7-NEXT:    s_mov_b32 s6, -1
772; GFX7-NEXT:    s_mov_b32 s7, 0xf000
773; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX7-NEXT:    v_mov_b32_e32 v0, s0
775; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], v0, v0, s2
776; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
777; GFX7-NEXT:    s_endpgm
778;
779; GFX8-LABEL: test_div_scale_f32_all_scalar_1:
780; GFX8:       ; %bb.0:
781; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
782; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
783; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
784; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
785; GFX8-NEXT:    v_mov_b32_e32 v0, s3
786; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, s2
787; GFX8-NEXT:    v_mov_b32_e32 v0, s0
788; GFX8-NEXT:    v_mov_b32_e32 v1, s1
789; GFX8-NEXT:    flat_store_dword v[0:1], v2
790; GFX8-NEXT:    s_endpgm
791;
792; GFX10-LABEL: test_div_scale_f32_all_scalar_1:
793; GFX10:       ; %bb.0:
794; GFX10-NEXT:    s_clause 0x2
795; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x4c
796; GFX10-NEXT:    s_load_dword s5, s[0:1], 0x70
797; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
798; GFX10-NEXT:    v_mov_b32_e32 v1, 0
799; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
800; GFX10-NEXT:    v_div_scale_f32 v0, s0, s5, s5, s4
801; GFX10-NEXT:    global_store_dword v1, v0, s[2:3]
802; GFX10-NEXT:    s_endpgm
  ; Only element 0 (the float) of the result is stored; the i1 element is unused.
803  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false)
804  %result0 = extractvalue { float, i1 } %result, 0
805  store float %result0, float addrspace(1)* %out, align 4
806  ret void
807}
808
; Same kernel signature as the all_scalar_1 f32 test, but the i1 operand of the
; intrinsic is true. The expected first source of v_div_scale_f32 becomes %a
; instead of %b: the GFX7/GFX8 checks expect (s2, v0, s2) with %b copied into
; v0, and the GFX10 checks expect (s4, s5, s4) using SGPRs only.
809define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
810; GFX7-LABEL: test_div_scale_f32_all_scalar_2:
811; GFX7:       ; %bb.0:
812; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
813; GFX7-NEXT:    s_load_dword s2, s[0:1], 0x13
814; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x1c
815; GFX7-NEXT:    s_mov_b32 s6, -1
816; GFX7-NEXT:    s_mov_b32 s7, 0xf000
817; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
818; GFX7-NEXT:    v_mov_b32_e32 v0, s0
819; GFX7-NEXT:    v_div_scale_f32 v0, s[0:1], s2, v0, s2
820; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
821; GFX7-NEXT:    s_endpgm
822;
823; GFX8-LABEL: test_div_scale_f32_all_scalar_2:
824; GFX8:       ; %bb.0:
825; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x4c
826; GFX8-NEXT:    s_load_dword s3, s[0:1], 0x70
827; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
828; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
829; GFX8-NEXT:    v_mov_b32_e32 v0, s3
830; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s2, v0, s2
831; GFX8-NEXT:    v_mov_b32_e32 v0, s0
832; GFX8-NEXT:    v_mov_b32_e32 v1, s1
833; GFX8-NEXT:    flat_store_dword v[0:1], v2
834; GFX8-NEXT:    s_endpgm
835;
836; GFX10-LABEL: test_div_scale_f32_all_scalar_2:
837; GFX10:       ; %bb.0:
838; GFX10-NEXT:    s_clause 0x2
839; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x4c
840; GFX10-NEXT:    s_load_dword s5, s[0:1], 0x70
841; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
842; GFX10-NEXT:    v_mov_b32_e32 v1, 0
843; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX10-NEXT:    v_div_scale_f32 v0, s0, s4, s5, s4
845; GFX10-NEXT:    global_store_dword v1, v0, s[2:3]
846; GFX10-NEXT:    s_endpgm
  ; Only element 0 (the float) of the result is stored; the i1 element is unused.
847  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
848  %result0 = extractvalue { float, i1 } %result, 0
849  store float %result0, float addrspace(1)* %out, align 4
850  ret void
851}
852
; f64 div.scale with both value operands (%a, %b) passed as kernel arguments,
; each preceded by an [8 x i32] padding argument; the i1 operand is false.
; The GFX7/GFX8 checks expect %b (s[4:5]) to be copied into v[0:1] and used as
; the first two sources with %a (s[2:3]) as the third. The GFX10 checks expect
; the SGPR pairs to be used directly (s[4:5], s[4:5], s[2:3]).
853define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) {
854; GFX7-LABEL: test_div_scale_f64_all_scalar_1:
855; GFX7:       ; %bb.0:
856; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
857; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
858; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
859; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
860; GFX7-NEXT:    v_mov_b32_e32 v0, s4
861; GFX7-NEXT:    v_mov_b32_e32 v1, s5
862; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
863; GFX7-NEXT:    v_mov_b32_e32 v3, s1
864; GFX7-NEXT:    v_mov_b32_e32 v2, s0
865; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
866; GFX7-NEXT:    s_endpgm
867;
868; GFX8-LABEL: test_div_scale_f64_all_scalar_1:
869; GFX8:       ; %bb.0:
870; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
871; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
872; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
873; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
874; GFX8-NEXT:    v_mov_b32_e32 v0, s4
875; GFX8-NEXT:    v_mov_b32_e32 v1, s5
876; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
877; GFX8-NEXT:    v_mov_b32_e32 v3, s1
878; GFX8-NEXT:    v_mov_b32_e32 v2, s0
879; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
880; GFX8-NEXT:    s_endpgm
881;
882; GFX10-LABEL: test_div_scale_f64_all_scalar_1:
883; GFX10:       ; %bb.0:
884; GFX10-NEXT:    s_clause 0x1
885; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
886; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
887; GFX10-NEXT:    v_mov_b32_e32 v2, 0
888; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
889; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
890; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
891; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
892; GFX10-NEXT:    s_endpgm
  ; Only element 0 (the double) of the result is stored; the i1 element is unused.
893  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false)
894  %result0 = extractvalue { double, i1 } %result, 0
895  store double %result0, double addrspace(1)* %out, align 8
896  ret void
897}
898
; Same kernel signature as the all_scalar_1 f64 test, but the i1 operand of the
; intrinsic is true. The expected first source of v_div_scale_f64 becomes %a
; instead of %b: the GFX7/GFX8 checks expect (s[2:3], v[0:1], s[2:3]) with %b
; copied into v[0:1], and the GFX10 checks expect (s[2:3], s[4:5], s[2:3]).
899define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) {
900; GFX7-LABEL: test_div_scale_f64_all_scalar_2:
901; GFX7:       ; %bb.0:
902; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x13
903; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
904; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
905; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
906; GFX7-NEXT:    v_mov_b32_e32 v0, s4
907; GFX7-NEXT:    v_mov_b32_e32 v1, s5
908; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3]
909; GFX7-NEXT:    v_mov_b32_e32 v3, s1
910; GFX7-NEXT:    v_mov_b32_e32 v2, s0
911; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
912; GFX7-NEXT:    s_endpgm
913;
914; GFX8-LABEL: test_div_scale_f64_all_scalar_2:
915; GFX8:       ; %bb.0:
916; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
917; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
918; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
919; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
920; GFX8-NEXT:    v_mov_b32_e32 v0, s4
921; GFX8-NEXT:    v_mov_b32_e32 v1, s5
922; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], s[2:3], v[0:1], s[2:3]
923; GFX8-NEXT:    v_mov_b32_e32 v3, s1
924; GFX8-NEXT:    v_mov_b32_e32 v2, s0
925; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
926; GFX8-NEXT:    s_endpgm
927;
928; GFX10-LABEL: test_div_scale_f64_all_scalar_2:
929; GFX10:       ; %bb.0:
930; GFX10-NEXT:    s_clause 0x1
931; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x4c
932; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
933; GFX10-NEXT:    v_mov_b32_e32 v2, 0
934; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
935; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
936; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
937; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
938; GFX10-NEXT:    s_endpgm
  ; Only element 0 (the double) of the result is stored; the i1 element is unused.
939  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
940  %result0 = extractvalue { double, i1 } %result, 0
941  store double %result0, double addrspace(1)* %out, align 8
942  ret void
943}
944
; f32 div.scale whose first intrinsic operand (the "num" of the test name) is
; the constant 1.0 and whose second operand is loaded per work-item from %in;
; the i1 operand is false. On all three targets the checks expect the constant
; to appear directly as the operand 1.0 in the third source position of
; v_div_scale_f32, with the loaded value in the first two source positions.
945define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) {
946; GFX7-LABEL: test_div_scale_f32_inline_imm_num:
947; GFX7:       ; %bb.0:
948; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
949; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
950; GFX7-NEXT:    v_mov_b32_e32 v1, 0
951; GFX7-NEXT:    s_mov_b32 s6, 0
952; GFX7-NEXT:    s_mov_b32 s7, 0xf000
953; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
954; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
955; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
956; GFX7-NEXT:    s_mov_b32 s6, -1
957; GFX7-NEXT:    s_waitcnt vmcnt(0)
958; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, 1.0
959; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
960; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
961; GFX7-NEXT:    s_endpgm
962;
963; GFX8-LABEL: test_div_scale_f32_inline_imm_num:
964; GFX8:       ; %bb.0:
965; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
966; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
967; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
968; GFX8-NEXT:    v_mov_b32_e32 v0, s2
969; GFX8-NEXT:    v_mov_b32_e32 v1, s3
970; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
971; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
972; GFX8-NEXT:    flat_load_dword v0, v[0:1]
973; GFX8-NEXT:    s_waitcnt vmcnt(0)
974; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, 1.0
975; GFX8-NEXT:    v_mov_b32_e32 v0, s0
976; GFX8-NEXT:    v_mov_b32_e32 v1, s1
977; GFX8-NEXT:    flat_store_dword v[0:1], v2
978; GFX8-NEXT:    s_endpgm
979;
980; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
981; GFX10:       ; %bb.0:
982; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
983; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
984; GFX10-NEXT:    v_mov_b32_e32 v1, 0
985; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
986; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
987; GFX10-NEXT:    s_waitcnt vmcnt(0)
988; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, 1.0
989; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
990; GFX10-NEXT:    s_endpgm
  ; Load the float at index %tid (the work-item id) within %in.
991  %tid = call i32 @llvm.amdgcn.workitem.id.x()
992  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
993  %a = load float, float addrspace(1)* %gep.0, align 4
994
  ; Only element 0 (the float) of the result is stored; the i1 element is unused.
995  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 1.0, float %a, i1 false)
996  %result0 = extractvalue { float, i1 } %result, 0
997  store float %result0, float addrspace(1)* %out, align 4
998  ret void
999}
1000
; f32 div.scale whose second intrinsic operand (the "den" of the test name) is
; the constant 2.0 and whose first operand is loaded per work-item from %in;
; the i1 operand is false. On all three targets the checks expect the constant
; to appear directly as the operand 2.0 in the first two source positions of
; v_div_scale_f32, with the loaded value in the third.
1001define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) {
1002; GFX7-LABEL: test_div_scale_f32_inline_imm_den:
1003; GFX7:       ; %bb.0:
1004; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1005; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1006; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1007; GFX7-NEXT:    s_mov_b32 s6, 0
1008; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1009; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1010; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
1011; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
1012; GFX7-NEXT:    s_mov_b32 s6, -1
1013; GFX7-NEXT:    s_waitcnt vmcnt(0)
1014; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], 2.0, 2.0, v0
1015; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
1016; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1017; GFX7-NEXT:    s_endpgm
1018;
1019; GFX8-LABEL: test_div_scale_f32_inline_imm_den:
1020; GFX8:       ; %bb.0:
1021; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1022; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1023; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1025; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1026; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1027; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1028; GFX8-NEXT:    flat_load_dword v0, v[0:1]
1029; GFX8-NEXT:    s_waitcnt vmcnt(0)
1030; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0
1031; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1032; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1033; GFX8-NEXT:    flat_store_dword v[0:1], v2
1034; GFX8-NEXT:    s_endpgm
1035;
1036; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
1037; GFX10:       ; %bb.0:
1038; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1039; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1040; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1041; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1042; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1043; GFX10-NEXT:    s_waitcnt vmcnt(0)
1044; GFX10-NEXT:    v_div_scale_f32 v0, s2, 2.0, 2.0, v0
1045; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1046; GFX10-NEXT:    s_endpgm
  ; Load the float at index %tid (the work-item id) within %in.
1047  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1048  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
1049  %a = load float, float addrspace(1)* %gep.0, align 4
1050
  ; Only element 0 (the float) of the result is stored; the i1 element is unused.
1051  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float 2.0, i1 false)
1052  %result0 = extractvalue { float, i1 } %result, 0
1053  store float %result0, float addrspace(1)* %out, align 4
1054  ret void
1055}
1056
; f32 div.scale where llvm.fabs.f32 is applied to the first intrinsic operand
; (the "num" of the test name). Both inputs are volatile loads of adjacent
; floats at %in[tid] and %in[tid + 1]; the i1 operand is false.
; On all three targets the checks expect the fabs to be a separate v_and_b32
; with mask 0x7fffffff on the first loaded value, whose result is the third
; source of v_div_scale_f32.
1057define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) {
1058; GFX7-LABEL: test_div_scale_f32_fabs_num:
1059; GFX7:       ; %bb.0:
1060; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1061; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1062; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1063; GFX7-NEXT:    s_mov_b32 s6, 0
1064; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1065; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1066; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
1067; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
1068; GFX7-NEXT:    s_waitcnt vmcnt(0)
1069; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
1070; GFX7-NEXT:    s_waitcnt vmcnt(0)
1071; GFX7-NEXT:    s_mov_b32 s6, -1
1072; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v2
1073; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, v1
1074; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
1075; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1076; GFX7-NEXT:    s_endpgm
1077;
1078; GFX8-LABEL: test_div_scale_f32_fabs_num:
1079; GFX8:       ; %bb.0:
1080; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1081; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1082; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1084; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1085; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1086; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1087; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
1088; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1089; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
1090; GFX8-NEXT:    s_waitcnt vmcnt(0)
1091; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
1092; GFX8-NEXT:    s_waitcnt vmcnt(0)
1093; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
1094; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
1095; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1096; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1097; GFX8-NEXT:    flat_store_dword v[0:1], v2
1098; GFX8-NEXT:    s_endpgm
1099;
1100; GFX10-LABEL: test_div_scale_f32_fabs_num:
1101; GFX10:       ; %bb.0:
1102; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1103; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1104; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1105; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
1106; GFX10-NEXT:    s_waitcnt vmcnt(0)
1107; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
1108; GFX10-NEXT:    s_waitcnt vmcnt(0)
1109; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v1
1110; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1111; GFX10-NEXT:    v_div_scale_f32 v0, s2, v2, v2, v0
1112; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1113; GFX10-NEXT:    s_endpgm
  ; %gep.0 addresses %in[tid]; %gep.1 addresses the next float after it.
1114  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1115  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
1116  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
1117
1118  %a = load volatile float, float addrspace(1)* %gep.0, align 4
1119  %b = load volatile float, float addrspace(1)* %gep.1, align 4
1120
1121  %a.fabs = call float @llvm.fabs.f32(float %a)
1122
  ; Only element 0 (the float) of the result is stored; the i1 element is unused.
1123  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a.fabs, float %b, i1 false)
1124  %result0 = extractvalue { float, i1 } %result, 0
1125  store float %result0, float addrspace(1)* %out, align 4
1126  ret void
1127}
1128
; f32 div.scale where llvm.fabs.f32 is applied to the second intrinsic operand
; (the "den" of the test name). Both inputs are volatile loads of adjacent
; floats at %in[tid] and %in[tid + 1]; the i1 operand is false.
; On all three targets the checks expect the fabs to be a separate v_and_b32
; with mask 0x7fffffff on the second loaded value, whose result is the first
; and second source of v_div_scale_f32.
1129define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) {
1130; GFX7-LABEL: test_div_scale_f32_fabs_den:
1131; GFX7:       ; %bb.0:
1132; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1133; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1134; GFX7-NEXT:    v_mov_b32_e32 v1, 0
1135; GFX7-NEXT:    s_mov_b32 s6, 0
1136; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1137; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1138; GFX7-NEXT:    s_mov_b64 s[4:5], s[2:3]
1139; GFX7-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
1140; GFX7-NEXT:    s_waitcnt vmcnt(0)
1141; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc
1142; GFX7-NEXT:    s_waitcnt vmcnt(0)
1143; GFX7-NEXT:    s_mov_b32 s6, -1
1144; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
1145; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, v2
1146; GFX7-NEXT:    s_mov_b64 s[2:3], s[6:7]
1147; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1148; GFX7-NEXT:    s_endpgm
1149;
1150; GFX8-LABEL: test_div_scale_f32_fabs_den:
1151; GFX8:       ; %bb.0:
1152; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1153; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1154; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1155; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1156; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1157; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
1158; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1159; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
1160; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1161; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
1162; GFX8-NEXT:    s_waitcnt vmcnt(0)
1163; GFX8-NEXT:    flat_load_dword v1, v[2:3] glc
1164; GFX8-NEXT:    s_waitcnt vmcnt(0)
1165; GFX8-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
1166; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v1, v1, v0
1167; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1168; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1169; GFX8-NEXT:    flat_store_dword v[0:1], v2
1170; GFX8-NEXT:    s_endpgm
1171;
1172; GFX10-LABEL: test_div_scale_f32_fabs_den:
1173; GFX10:       ; %bb.0:
1174; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1175; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1176; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1177; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
1178; GFX10-NEXT:    s_waitcnt vmcnt(0)
1179; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
1180; GFX10-NEXT:    s_waitcnt vmcnt(0)
1181; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v2
1182; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, v1
1183; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1184; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1185; GFX10-NEXT:    s_endpgm
  ; %gep.0 addresses %in[tid]; %gep.1 addresses the next float after it.
1186  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1187  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
1188  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
1189
1190  %a = load volatile float, float addrspace(1)* %gep.0, align 4
1191  %b = load volatile float, float addrspace(1)* %gep.1, align 4
1192
1193  %b.fabs = call float @llvm.fabs.f32(float %b)
1194
  ; Only element 0 (the float) of the result is stored; the i1 element is unused.
1195  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b.fabs, i1 false)
1196  %result0 = extractvalue { float, i1 } %result, 0
1197  store float %result0, float addrspace(1)* %out, align 4
1198  ret void
1199}
1200
; f32 div.scale with the constant 8.0 as the first intrinsic operand and undef
; as the second; the i1 operand is false.
; 8.0 appears in the expected output as the bit pattern 0x41000000: the
; GFX7/GFX8 checks expect it to be moved into v0 first, while the GFX10 checks
; expect it as a literal operand of v_div_scale_f32. The undef operand appears
; as s0 in the first two source positions.
1201define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 {
1202; GFX7-LABEL: test_div_scale_f32_val_undef_val:
1203; GFX7:       ; %bb.0:
1204; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1205; GFX7-NEXT:    v_mov_b32_e32 v0, 0x41000000
1206; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1207; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], s0, s0, v0
1208; GFX7-NEXT:    s_mov_b32 s2, -1
1209; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1210; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1211; GFX7-NEXT:    s_endpgm
1212;
1213; GFX8-LABEL: test_div_scale_f32_val_undef_val:
1214; GFX8:       ; %bb.0:
1215; GFX8-NEXT:    v_mov_b32_e32 v0, 0x41000000
1216; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s0, s0, v0
1217; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1218; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1219; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1220; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1221; GFX8-NEXT:    flat_store_dword v[0:1], v2
1222; GFX8-NEXT:    s_endpgm
1223;
1224; GFX10-LABEL: test_div_scale_f32_val_undef_val:
1225; GFX10:       ; %bb.0:
1226; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1227; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1228; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1229; GFX10-NEXT:    v_div_scale_f32 v0, s2, s0, s0, 0x41000000
1230; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1231; GFX10-NEXT:    s_endpgm
  ; Only element 0 (the float) of the result is stored; the i1 element is unused.
1232  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false)
1233  %result0 = extractvalue { float, i1 } %result, 0
1234  store float %result0, float addrspace(1)* %out, align 4
1235  ret void
1236}
1237
; f32 div.scale with undef as the first intrinsic operand and the constant 8.0
; as the second; the i1 operand is false.
; On all three targets the checks expect 8.0 (bit pattern 0x41000000) to be
; moved into v0 and used as the first two sources of v_div_scale_f32, with the
; undef operand appearing as s0 in the third source position.
1238define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 {
1239; GFX7-LABEL: test_div_scale_f32_undef_val_val:
1240; GFX7:       ; %bb.0:
1241; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1242; GFX7-NEXT:    v_mov_b32_e32 v0, 0x41000000
1243; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1244; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], v0, v0, s0
1245; GFX7-NEXT:    s_mov_b32 s2, -1
1246; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1247; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1248; GFX7-NEXT:    s_endpgm
1249;
1250; GFX8-LABEL: test_div_scale_f32_undef_val_val:
1251; GFX8:       ; %bb.0:
1252; GFX8-NEXT:    v_mov_b32_e32 v0, 0x41000000
1253; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], v0, v0, s0
1254; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1255; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1256; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1257; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1258; GFX8-NEXT:    flat_store_dword v[0:1], v2
1259; GFX8-NEXT:    s_endpgm
1260;
1261; GFX10-LABEL: test_div_scale_f32_undef_val_val:
1262; GFX10:       ; %bb.0:
1263; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1264; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41000000
1265; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1266; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1267; GFX10-NEXT:    v_div_scale_f32 v0, s2, v0, v0, s0
1268; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1269; GFX10-NEXT:    s_endpgm
  ; Only element 0 (the float) of the result is stored; the i1 element is unused.
1270  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)
1271  %result0 = extractvalue { float, i1 } %result, 0
1272  store float %result0, float addrspace(1)* %out, align 4
1273  ret void
1274}
1275
; f32 div.scale with undef for both value operands; the i1 operand is false.
; On all three targets the checks expect a v_div_scale_f32 to still be emitted,
; with s0 in all three source positions, and its result stored to %out.
1276define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 {
1277; GFX7-LABEL: test_div_scale_f32_undef_undef_val:
1278; GFX7:       ; %bb.0:
1279; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1280; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1281; GFX7-NEXT:    v_div_scale_f32 v0, s[2:3], s0, s0, s0
1282; GFX7-NEXT:    s_mov_b32 s2, -1
1283; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1284; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1285; GFX7-NEXT:    s_endpgm
1286;
1287; GFX8-LABEL: test_div_scale_f32_undef_undef_val:
1288; GFX8:       ; %bb.0:
1289; GFX8-NEXT:    v_div_scale_f32 v2, s[2:3], s0, s0, s0
1290; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1291; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1292; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1293; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1294; GFX8-NEXT:    flat_store_dword v[0:1], v2
1295; GFX8-NEXT:    s_endpgm
1296;
1297; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
1298; GFX10:       ; %bb.0:
1299; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1300; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1301; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1302; GFX10-NEXT:    v_div_scale_f32 v0, s2, s0, s0, s0
1303; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1304; GFX10-NEXT:    s_endpgm
  ; Only element 0 (the float) of the result is stored; the i1 element is unused.
1305  %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false)
1306  %result0 = extractvalue { float, i1 } %result, 0
1307  store float %result0, float addrspace(1)* %out, align 4
1308  ret void
1309}
1310
; f64 div.scale with the constant 8.0 as the first intrinsic operand and undef
; as the second; the i1 operand is false.
; On all three targets the checks expect the double 8.0 to be built in s[2:3]
; from two 32-bit moves (low half 0, high half 0x40200000) and used as the
; third source of v_div_scale_f64. The undef operand appears as v[0:1] in the
; GFX7/GFX8 checks and as s[0:1] in the GFX10 checks.
1311define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 {
1312; GFX7-LABEL: test_div_scale_f64_val_undef_val:
1313; GFX7:       ; %bb.0:
1314; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1315; GFX7-NEXT:    s_mov_b32 s2, 0
1316; GFX7-NEXT:    s_mov_b32 s3, 0x40200000
1317; GFX7-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
1318; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1319; GFX7-NEXT:    v_mov_b32_e32 v3, s1
1320; GFX7-NEXT:    v_mov_b32_e32 v2, s0
1321; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1322; GFX7-NEXT:    s_endpgm
1323;
1324; GFX8-LABEL: test_div_scale_f64_val_undef_val:
1325; GFX8:       ; %bb.0:
1326; GFX8-NEXT:    s_mov_b32 s2, 0
1327; GFX8-NEXT:    s_mov_b32 s3, 0x40200000
1328; GFX8-NEXT:    v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3]
1329; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1330; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1332; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1333; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1334; GFX8-NEXT:    s_endpgm
1335;
1336; GFX10-LABEL: test_div_scale_f64_val_undef_val:
1337; GFX10:       ; %bb.0:
1338; GFX10-NEXT:    s_mov_b32 s2, 0
1339; GFX10-NEXT:    s_mov_b32 s3, 0x40200000
1340; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1341; GFX10-NEXT:    v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3]
1342; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1343; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1344; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1345; GFX10-NEXT:    s_endpgm
  ; Only element 0 (the double) of the result is stored; the i1 element is unused.
1346  %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false)
1347  %result0 = extractvalue { double, i1 } %result, 0
1348  store double %result0, double addrspace(1)* %out, align 8
1349  ret void
1350}
1351
; Intrinsics used by the tests in this file. Each div.scale variant takes two
; floating-point values plus an i1 and returns a { value, i1 } pair; the tests
; visible here store only element 0 of that pair.
1352declare i32 @llvm.amdgcn.workitem.id.x() #1
1353declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1) #1
1354declare { double, i1 } @llvm.amdgcn.div.scale.f64(double, double, i1) #1
1355declare float @llvm.fabs.f32(float) #1
1356
; Attribute group #0 is attached to the undef-operand kernels; #1 is attached
; to the intrinsic declarations above.
1357attributes #0 = { nounwind }
1358attributes #1 = { nounwind readnone speculatable }
1359