1; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
2
3declare i32 @llvm.r600.read.tidig.x() nounwind readnone
4declare { float, i1 } @llvm.AMDGPU.div.scale.f32(float, float, i1) nounwind readnone
5declare { double, i1 } @llvm.AMDGPU.div.scale.f64(double, double, i1) nounwind readnone
6declare float @llvm.fabs.f32(float) nounwind readnone
7
; f32 div.scale with the i1 flag false. %a and %b are two adjacent floats
; loaded from %in at this thread's index, so both live in VGPRs.
; Expected source operand order: B, B, A.
; SI-LABEL: {{^}}test_div_scale_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1

  %a = load float addrspace(1)* %gep.0, align 4
  %b = load float addrspace(1)* %gep.1, align 4

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
27
; f32 div.scale with the i1 flag true. %a and %b are two adjacent floats
; loaded from %in at this thread's index, so both live in VGPRs.
; Expected source operand order: A, B, A.
; SI-LABEL: {{^}}test_div_scale_f32_2:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1

  %a = load float addrspace(1)* %gep.0, align 4
  %b = load float addrspace(1)* %gep.1, align 4

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
47
; f64 div.scale with the i1 flag false. %a and %b are two adjacent doubles
; loaded from %in at this thread's index (64-bit VGPR pairs). The %aptr
; argument is not referenced by the body.
; Expected source operand order: B, B, A.
; SI-LABEL: {{^}}test_div_scale_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1

  %a = load double addrspace(1)* %gep.0, align 8
  %b = load double addrspace(1)* %gep.1, align 8

  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}
67
; f64 div.scale with the i1 flag true. %a and %b are two adjacent doubles
; loaded from %in at this thread's index (64-bit VGPR pairs). The %aptr
; argument is not referenced by the body.
; Expected source operand order: A, B, A.
; SI-LABEL: {{^}}test_div_scale_f64_2:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep.0 = getelementptr double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double addrspace(1)* %gep.0, i32 1

  %a = load double addrspace(1)* %gep.0, align 8
  %b = load double addrspace(1)* %gep.1, align 8

  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}
87
; f32 div.scale, flag false, with the numerator %a passed as a kernel argument
; (expected in an SGPR via s_load_dword) and the denominator %b loaded per
; thread into a VGPR. Expected source operand order: B, B, A.
; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_1:
; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
; SI-DAG: s_load_dword [[A:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep = getelementptr float addrspace(1)* %in, i32 %tid

  %b = load float addrspace(1)* %gep, align 4

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
105
; f32 div.scale, flag true, with the numerator %a passed as a kernel argument
; (expected in an SGPR via s_load_dword) and the denominator %b loaded per
; thread into a VGPR. Expected source operand order: A, B, A.
; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_2:
; SI-DAG: buffer_load_dword [[B:v[0-9]+]]
; SI-DAG: s_load_dword [[A:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep = getelementptr float addrspace(1)* %in, i32 %tid

  %b = load float addrspace(1)* %gep, align 4

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
123
; f32 div.scale, flag false, with the denominator %b passed as a kernel
; argument (expected in an SGPR via s_load_dword) and the numerator %a loaded
; per thread into a VGPR. Expected source operand order: B, B, A.
; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: s_load_dword [[B:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep = getelementptr float addrspace(1)* %in, i32 %tid

  %a = load float addrspace(1)* %gep, align 4

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
141
; f32 div.scale, flag true, with the denominator %b passed as a kernel
; argument (expected in an SGPR via s_load_dword) and the numerator %a loaded
; per thread into a VGPR. Expected source operand order: A, B, A.
; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_2:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]]
; SI-DAG: s_load_dword [[B:s[0-9]+]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep = getelementptr float addrspace(1)* %in, i32 %tid

  %a = load float addrspace(1)* %gep, align 4

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
159
; f64 div.scale, flag false, with the numerator %a passed as a kernel argument
; (expected in an SGPR pair via s_load_dwordx2 at offset 0xd) and the
; denominator %b loaded per thread. Expected source operand order: B, B, A.
; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_1:
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep = getelementptr double addrspace(1)* %in, i32 %tid

  %b = load double addrspace(1)* %gep, align 8

  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}
177
; f64 div.scale, flag true, with the numerator %a passed as a kernel argument
; (expected in an SGPR pair via s_load_dwordx2 at offset 0xd) and the
; denominator %b loaded per thread. Expected source operand order: A, B, A.
; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_2:
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep = getelementptr double addrspace(1)* %in, i32 %tid

  %b = load double addrspace(1)* %gep, align 8

  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}
195
; f64 div.scale, flag false, with the denominator %b passed as a kernel
; argument (expected in an SGPR pair via s_load_dwordx2 at offset 0xd) and the
; numerator %a loaded per thread. Expected source operand order: B, B, A.
; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep = getelementptr double addrspace(1)* %in, i32 %tid

  %a = load double addrspace(1)* %gep, align 8

  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}
213
; f64 div.scale, flag true, with the denominator %b passed as a kernel
; argument (expected in an SGPR pair via s_load_dwordx2 at offset 0xd) and the
; numerator %a loaded per thread. Expected source operand order: A, B, A.
; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_2:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep = getelementptr double addrspace(1)* %in, i32 %tid

  %a = load double addrspace(1)* %gep, align 8

  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}
231
; f32 div.scale, flag false, with both operands passed as kernel arguments
; (s_load_dword at offsets 0xb and 0xc). The checks expect A to be copied into
; a VGPR first, giving the source operand order B, B, VA.
; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
245
; f32 div.scale, flag true, with both operands passed as kernel arguments
; (s_load_dword at offsets 0xb and 0xc). The checks expect B to be copied into
; a VGPR first, giving the source operand order A, VB, A.
; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
259
; f64 div.scale, flag false, with both operands passed as kernel arguments
; (s_load_dwordx2 at offsets 0xb and 0xd). The checks expect both halves of A
; to be copied into a VGPR pair, giving the source operand order B, B, VA.
; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1:
; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]]
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}
274
; f64 div.scale, flag true, with both operands passed as kernel arguments
; (s_load_dwordx2 at offsets 0xb and 0xd). The checks expect both halves of B
; to be copied into a VGPR pair, giving the source operand order A, VB, A.
; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2:
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]]
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
  %result = call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
  %result0 = extractvalue { double, i1 } %result, 0
  store double %result0, double addrspace(1)* %out, align 8
  ret void
}
289
; f32 div.scale, flag false, with the constant 1.0 as the numerator and a
; per-thread loaded value as the denominator. The checks expect 1.0 to appear
; directly as the third source operand: A, A, 1.0.
; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_num:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
  %a = load float addrspace(1)* %gep.0, align 4

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float 1.0, float %a, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
305
; f32 div.scale, flag false, with a per-thread loaded numerator and the
; constant 2.0 as the denominator. The checks expect 2.0 to appear directly as
; the first two source operands: 2.0, 2.0, A.
; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_den:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
  %a = load float addrspace(1)* %gep.0, align 4

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float 2.0, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
321
; f32 div.scale, flag false, with llvm.fabs applied to the numerator %a before
; the call. The checks expect the fabs to appear as an |A| source modifier on
; the third operand rather than as a separate instruction: B, B, |A|.
; SI-LABEL: {{^}}test_div_scale_f32_fabs_num:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]|
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1

  %a = load float addrspace(1)* %gep.0, align 4
  %b = load float addrspace(1)* %gep.1, align 4

  %a.fabs = call float @llvm.fabs.f32(float %a) nounwind readnone

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a.fabs, float %b, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
343
; f32 div.scale, flag false, with llvm.fabs applied to the denominator %b
; before the call. The checks expect the fabs to appear as a |B| source
; modifier on the first two operands: |B|, |B|, A.
; SI-LABEL: {{^}}test_div_scale_f32_fabs_den:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
  %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
  %gep.0 = getelementptr float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float addrspace(1)* %gep.0, i32 1

  %a = load float addrspace(1)* %gep.0, align 4
  %b = load float addrspace(1)* %gep.1, align 4

  %b.fabs = call float @llvm.fabs.f32(float %b) nounwind readnone

  %result = call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b.fabs, i1 false) nounwind readnone
  %result0 = extractvalue { float, i1 } %result, 0
  store float %result0, float addrspace(1)* %out, align 4
  ret void
}
365