1; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s
2; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s
3
; Test: 1.0 / x on a scalar float loaded from %arg, with !fpmath !0 on the fdiv.
; Denormal-enabled (ieee) run: expects a scale chosen between 1.0 and 0x2f800000
; (2^-32) by comparing |x| against 0x6f800000 (2^96), a multiply of x by that
; scale, v_rcp_f32 of the product, and a final multiply of the rcp by the scale.
; Flush (preserve-sign) run: expects a single v_rcp_f32 of the loaded value.
; Both runs: the result register is written back with global_store_dword.
4; GCN-LABEL: {{^}}div_1_by_x_25ulp:
5; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
6; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
7; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
8; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
9; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
10; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
11; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
12; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
13
14; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
15
16; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
17define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
18  %load = load float, float addrspace(1)* %arg, align 4
19  %div = fdiv float 1.000000e+00, %load, !fpmath !0
20  store float %div, float addrspace(1)* %arg, align 4
21  ret void
22}
23
; Test: -1.0 / x on a scalar float loaded from %arg, with !fpmath !0 on the fdiv.
; Denormal-enabled (ieee) run: same scaled-rcp shape as the positive case, but
; the pre-scale multiply is expected in its e64 form with the scale negated.
; Flush (preserve-sign) run: expects a single v_rcp_f32_e64 with a negated source.
; Both runs: the result register is written back with global_store_dword.
24; GCN-LABEL: {{^}}div_minus_1_by_x_25ulp:
25; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
26; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
27; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
28; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
29; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
30; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], [[VAL]], -[[SCALE]]
31; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
32; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
33
34; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
35
36; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
37define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
38  %load = load float, float addrspace(1)* %arg, align 4
39  %div = fdiv float -1.000000e+00, %load, !fpmath !0
40  store float %div, float addrspace(1)* %arg, align 4
41  ret void
42}
43
; Test: 1.0 / (-x), where the negation is written as fsub -0.0, x and the fdiv
; carries !fpmath !0.
; Denormal-enabled (ieee) run: expects the scaled-rcp shape with the negation
; applied to the loaded value as a source modifier of the pre-scale multiply.
; Flush (preserve-sign) run: expects a single v_rcp_f32_e64 with a negated source.
; Both runs: the result register is written back with global_store_dword.
44; GCN-LABEL: {{^}}div_1_by_minus_x_25ulp:
45; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
46; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
47; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
48; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
49; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
50; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], -[[VAL]], [[SCALE]]
51; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
52; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
53
54; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
55
56; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
57define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
58  %load = load float, float addrspace(1)* %arg, align 4
59  %neg = fsub float -0.000000e+00, %load
60  %div = fdiv float 1.000000e+00, %neg, !fpmath !0
61  store float %div, float addrspace(1)* %arg, align 4
62  ret void
63}
64
; Test: -1.0 / (-x), where the negation is written as fsub -0.0, x and the fdiv
; carries !fpmath !0.
; The expected code has no negation modifiers at all: the checks are the same
; as for the plain 1.0 / x case (scaled rcp in the denormal-enabled run, a bare
; v_rcp_f32_e32 in the flush run), i.e. the two sign flips are expected to cancel.
65; GCN-LABEL: {{^}}div_minus_1_by_minus_x_25ulp:
66; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
67; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
68; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
69; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
70; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
71; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
72; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
73; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
74
75; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
76
77; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
78define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
79  %load = load float, float addrspace(1)* %arg, align 4
80  %neg = fsub float -0.000000e+00, %load
81  %div = fdiv float -1.000000e+00, %neg, !fpmath !0
82  store float %div, float addrspace(1)* %arg, align 4
83  ret void
84}
85
; Test: <4 x float> splat(1.0) / x with !fpmath !0 on the fdiv.
; Denormal-enabled (ieee) run: expects, in any order, four compare/select scale
; pairs against the shared 0x6f800000 / 0x2f800000 constants, four pre-scale
; multiplies, four v_rcp_f32 and four further multiplies.
; Flush (preserve-sign) run: expects four v_rcp_f32_e32; the first and last read
; the first and last SGPR of the loaded quad, and their destinations bound the
; VGPR range written by global_store_dwordx4.
86; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp:
87; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
88; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
89; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
90; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
91; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
92; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
93; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
94; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
95; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
96; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
97; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
98; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
99; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
100; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
101; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
102; GCN-DENORM-DAG: v_rcp_f32_e32
103; GCN-DENORM-DAG: v_rcp_f32_e32
104; GCN-DENORM-DAG: v_rcp_f32_e32
105; GCN-DENORM-DAG: v_rcp_f32_e32
106; GCN-DENORM-DAG: v_mul_f32_e32
107; GCN-DENORM-DAG: v_mul_f32_e32
108; GCN-DENORM-DAG: v_mul_f32_e32
109; GCN-DENORM-DAG: v_mul_f32_e32
110
111; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
112; GCN-FLUSH:      v_rcp_f32_e32
113; GCN-FLUSH:      v_rcp_f32_e32
114; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
115; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
116define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
117  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
118  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
119  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
120  ret void
121}
122
; Test: <4 x float> splat(-1.0) / x with !fpmath !0 on the fdiv.
; Denormal-enabled (ieee) run: expects, in any order, four compare/select scale
; pairs, four e64 pre-scale multiplies with the scale negated, four v_rcp_f32
; and four further multiplies.
; Flush (preserve-sign) run: expects four v_rcp_f32_e64; the first and last read
; the negated first and last SGPR of the loaded quad, and their destinations
; bound the VGPR range written by global_store_dwordx4.
; The SGPR quad is captured from this function's own s_load_dwordx4 so the rcp
; operand patterns do not depend on captures left over from an earlier function.
123; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
124; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
125; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
126; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
127; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
128; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
129; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
130; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
131; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
132; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
133; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
134; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
135; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
136; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
137; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
138; GCN-DENORM-DAG: v_rcp_f32_e32
139; GCN-DENORM-DAG: v_rcp_f32_e32
140; GCN-DENORM-DAG: v_rcp_f32_e32
141; GCN-DENORM-DAG: v_rcp_f32_e32
142; GCN-DENORM-DAG: v_mul_f32_e32
143; GCN-DENORM-DAG: v_mul_f32_e32
144; GCN-DENORM-DAG: v_mul_f32_e32
145; GCN-DENORM-DAG: v_mul_f32_e32
146
147; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
148; GCN-FLUSH:      v_rcp_f32_e64
149; GCN-FLUSH:      v_rcp_f32_e64
150; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
151define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
152  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
153  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0
154  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
155  ret void
156}
157
; Test: <4 x float> splat(1.0) / (-x), where the negation is written as
; fsub splat(-0.0), x and the fdiv carries !fpmath !0.
; Denormal-enabled (ieee) run: expects, in any order, four compare/select scale
; pairs, four e64 pre-scale multiplies with the SGPR source negated, four
; v_rcp_f32 and four further multiplies.
; Flush (preserve-sign) run: expects four v_rcp_f32_e64; the first and last read
; the negated first and last SGPR of the loaded quad, and their destinations
; bound the VGPR range written by global_store_dwordx4.
; The SGPR quad is captured from this function's own s_load_dwordx4 so the rcp
; operand patterns do not depend on captures left over from an earlier function.
158; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
159; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
160; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
161; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
162; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
163; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
164; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
165; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
166; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
167; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
168; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
169; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
170; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
171; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
172; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
173; GCN-DENORM-DAG: v_rcp_f32_e32
174; GCN-DENORM-DAG: v_rcp_f32_e32
175; GCN-DENORM-DAG: v_rcp_f32_e32
176; GCN-DENORM-DAG: v_rcp_f32_e32
177; GCN-DENORM-DAG: v_mul_f32_e32
178; GCN-DENORM-DAG: v_mul_f32_e32
179; GCN-DENORM-DAG: v_mul_f32_e32
180; GCN-DENORM-DAG: v_mul_f32_e32
181
182; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
183; GCN-FLUSH:      v_rcp_f32_e64
184; GCN-FLUSH:      v_rcp_f32_e64
185; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
186; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
187define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
188  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
189  %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
190  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0
191  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
192  ret void
193}
194
; Test: <4 x float> splat(-1.0) / (-x), where the negation is written as
; fsub splat(-0.0), x and the fdiv carries !fpmath !0.
; The expected code has no negation modifiers: the checks match those of the
; splat(1.0) / x vector case (scaled rcp per lane in the denormal-enabled run,
; four bare v_rcp_f32_e32 in the flush run), i.e. the sign flips are expected
; to cancel.
195; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
196; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
197; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
198; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
199; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
200; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
201; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
202; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
203; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
204; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
205; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
206; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
207; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
208; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
209; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
210; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
211; GCN-DENORM-DAG: v_rcp_f32_e32
212; GCN-DENORM-DAG: v_rcp_f32_e32
213; GCN-DENORM-DAG: v_rcp_f32_e32
214; GCN-DENORM-DAG: v_rcp_f32_e32
215; GCN-DENORM-DAG: v_mul_f32_e32
216; GCN-DENORM-DAG: v_mul_f32_e32
217; GCN-DENORM-DAG: v_mul_f32_e32
218; GCN-DENORM-DAG: v_mul_f32_e32
219
220; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
221; GCN-FLUSH:      v_rcp_f32_e32
222; GCN-FLUSH:      v_rcp_f32_e32
223; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
224; GCN-FLUSH:      global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
225define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
226  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
227  %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
228  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0
229  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
230  ret void
231}
232
; Test: <4 x float> constant numerators <2.0, 1.0, -1.0, -2.0> divided by x,
; with !fpmath !0 on the fdiv.
; Denormal-enabled (ieee) run: expects v_div_scale / v_div_fmas / v_div_fixup
; instructions whose last operand is 2.0 or -2.0, plus two scaled-rcp sequences
; (one of them with a negated scale) - presumably for the 1.0 and -1.0 lanes.
; Flush (preserve-sign) run: expects one v_rcp_f32_e32 and one v_rcp_f32_e64 and
; rejects any v_div* instruction.
; Both runs: two compare/select scale pairs are matched, and any further
; v_cmp_gt_f32_e64 / v_cndmask_b32_e32 before the final store is rejected.
233; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
234; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
235; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
236; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
237; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
238; GCN-DENORM-DAG: v_rcp_f32_e32
239; GCN-DENORM-DAG: v_rcp_f32_e32
240
241; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
242; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
243
244; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
245; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
246; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
247; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
248
249; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
250; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
251; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
252; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
253; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
254; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
255
256; GCN-DENORM-DAG: v_div_fmas_f32
257; GCN-DENORM-DAG: v_div_fmas_f32
258; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
259; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
260
261; GCN-FLUSH-DAG:  v_rcp_f32_e32
262; GCN-FLUSH-DAG:  v_rcp_f32_e64
263
264; GCN-NOT:        v_cmp_gt_f32_e64
265; GCN-NOT:        v_cndmask_b32_e32
266; GCN-FLUSH-NOT:  v_div
267
268; GCN:            global_store_dwordx4
269define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
270  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
271  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0
272  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
273  ret void
274}
275
; Test: <4 x float> constant numerators <2.0, 1.0, -1.0, -2.0> divided by -x
; (negation written with the fneg instruction), with !fpmath !0 on the fdiv.
; Denormal-enabled (ieee) run: expects v_div_scale / v_div_fmas / v_div_fixup
; instructions whose last operand is -2.0 in every case, plus two scaled-rcp
; sequences (one of them with a negated SGPR source) - presumably for the 1.0
; and -1.0 lanes.
; Flush (preserve-sign) run: expects one v_rcp_f32_e32 and one v_rcp_f32_e64 and
; rejects any v_div* instruction.
; Both runs: two compare/select scale pairs are matched, and any further
; v_cmp_gt_f32_e64 / v_cndmask_b32_e32 before the final store is rejected.
276; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
277; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
278; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
279; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
280; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
281; GCN-DENORM-DAG: v_rcp_f32_e32
282; GCN-DENORM-DAG: v_rcp_f32_e32
283
284; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
285; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
286
287; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
288; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
289; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
290; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
291
292; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
293; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
294; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
295; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
296; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
297; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
298
299; GCN-DENORM-DAG: v_div_fmas_f32
300; GCN-DENORM-DAG: v_div_fmas_f32
301; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
302; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
303
304; GCN-FLUSH-DAG:  v_rcp_f32_e32
305; GCN-FLUSH-DAG:  v_rcp_f32_e64
306
307; GCN-NOT:        v_cmp_gt_f32_e64
308; GCN-NOT:        v_cndmask_b32_e32
309; GCN-FLUSH-NOT:  v_div
310
311; GCN:            global_store_dwordx4
312define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
313  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
314  %neg = fneg <4 x float> %load
315  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0
316  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
317  ret void
318}
319
; Test: variable numerator %num (a kernel argument) divided by a loaded float,
; with !fpmath !0 on the fdiv.
; Denormal-enabled (ieee) run: expects the v_div_scale / v_rcp / v_div_fmas /
; v_div_fixup expansion, with the v_div_fixup destination being the stored value.
; Flush (preserve-sign) run: expects the compare/select scale against
; 0x6f800000 / 0x2f800000, a pre-scale multiply of the loaded value, v_rcp_f32,
; and a final multiply by the scale that produces the stored value.
320; GCN-LABEL: {{^}}div_v_by_x_25ulp:
321; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
322
323; GCN-DENORM-DAG: v_div_scale_f32
324; GCN-DENORM-DAG: v_rcp_f32_e32
325; GCN-DENORM-DAG: v_div_scale_f32
326; GCN-DENORM:     v_div_fmas_f32
327; GCN-DENORM:     v_div_fixup_f32 [[OUT:v[0-9]+]],
328
329; GCN-FLUSH-DAG:  v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
330; GCN-FLUSH-DAG:  v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
331; GCN-FLUSH-DAG:  v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
332; GCN-FLUSH-DAG:  v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
333; GCN-FLUSH:      v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
334; GCN-FLUSH:      v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
335; GCN-FLUSH:      v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
336
337; GCN:            global_store_dword v[{{[0-9:]+}}], [[OUT]], off
338define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) {
339  %load = load float, float addrspace(1)* %arg, align 4
340  %div = fdiv float %num, %load, !fpmath !0
341  store float %div, float addrspace(1)* %arg, align 4
342  ret void
343}
344
; Test: 1.0 / x with the 'fast' flag (and !fpmath !0) on the fdiv.
; Both runs expect a single v_rcp_f32_e32 of the loaded scalar, stored directly.
; NOTE(review): the pointer register pair is matched literally as s[0:1] here,
; whereas the 25ulp tests in this file use a regex - confirm the literal is
; intended.
345; GCN-LABEL: {{^}}div_1_by_x_fast:
346; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
347; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
348; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
349define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
350  %load = load float, float addrspace(1)* %arg, align 4
351  %div = fdiv fast float 1.000000e+00, %load, !fpmath !0
352  store float %div, float addrspace(1)* %arg, align 4
353  ret void
354}
355
; Test: -1.0 / x with the 'fast' flag (and !fpmath !0) on the fdiv.
; Both runs expect a single v_rcp_f32_e64 whose source is the negated loaded
; scalar, stored directly.
356; GCN-LABEL: {{^}}div_minus_1_by_x_fast:
357; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
358; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
359; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
360define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
361  %load = load float, float addrspace(1)* %arg, align 4
362  %div = fdiv fast float -1.000000e+00, %load, !fpmath !0
363  store float %div, float addrspace(1)* %arg, align 4
364  ret void
365}
366
; Test: 1.0 / (-x) with the 'fast' flag on the fdiv; the negation is written as
; fsub -0.0, x.
; Both runs expect a single v_rcp_f32_e64 whose source is the negated loaded
; scalar, stored directly.
; NOTE(review): !fpmath !0 is attached to the fsub here, not to the fdiv (which
; carries only 'fast') - confirm this placement is intended.
367; GCN-LABEL: {{^}}div_1_by_minus_x_fast:
368; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
369; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
370; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
371define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
372  %load = load float, float addrspace(1)* %arg, align 4
373  %neg = fsub float -0.000000e+00, %load, !fpmath !0
374  %div = fdiv fast float 1.000000e+00, %neg
375  store float %div, float addrspace(1)* %arg, align 4
376  ret void
377}
378
; Test: -1.0 / (-x) with the 'fast' flag on the fdiv; the negation is written as
; fsub -0.0, x.
; Both runs expect a single v_rcp_f32_e32 of the un-negated loaded scalar, i.e.
; the two sign flips are expected to cancel.
; NOTE(review): !fpmath !0 is attached to the fsub here, not to the fdiv (which
; carries only 'fast') - confirm this placement is intended.
379; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast:
380; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
381; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
382; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
383define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
384  %load = load float, float addrspace(1)* %arg, align 4
385  %neg = fsub float -0.000000e+00, %load, !fpmath !0
386  %div = fdiv fast float -1.000000e+00, %neg
387  store float %div, float addrspace(1)* %arg, align 4
388  ret void
389}
390
; Test: 1.0 / x with no fast-math flags and no !fpmath metadata on the fdiv.
; Both runs expect the v_div_scale / v_rcp / v_div_fmas / v_div_fixup expansion
; rather than a bare reciprocal.
391; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
392; GCN-DAG: v_div_scale_f32
393; GCN-DAG: v_rcp_f32_e32
394; GCN-DAG: v_div_scale_f32
395; GCN:     v_div_fmas_f32
396; GCN:     v_div_fixup_f32
397define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
398  %load = load float, float addrspace(1)* %arg, align 4
399  %div = fdiv float 1.000000e+00, %load
400  store float %div, float addrspace(1)* %arg, align 4
401  ret void
402}
403
; Test: -1.0 / x with no fast-math flags and no !fpmath metadata on the fdiv.
; Both runs expect the v_div_scale / v_rcp / v_div_fmas / v_div_fixup expansion
; rather than a bare reciprocal.
404; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
405; GCN-DAG: v_div_scale_f32
406; GCN-DAG: v_rcp_f32_e32
407; GCN-DAG: v_div_scale_f32
408; GCN:     v_div_fmas_f32
409; GCN:     v_div_fixup_f32
410define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
411  %load = load float, float addrspace(1)* %arg, align 4
412  %div = fdiv float -1.000000e+00, %load
413  store float %div, float addrspace(1)* %arg, align 4
414  ret void
415}
416
; Test: 1.0 / (-x) (negation written as fsub -0.0, x) with no fast-math flags
; and no !fpmath metadata on the fdiv.
; Both runs expect the v_div_scale / v_rcp / v_div_fmas / v_div_fixup expansion
; rather than a bare reciprocal.
417; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
418; GCN-DAG: v_div_scale_f32
419; GCN-DAG: v_rcp_f32_e32
420; GCN-DAG: v_div_scale_f32
421; GCN:     v_div_fmas_f32
422; GCN:     v_div_fixup_f32
423define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
424  %load = load float, float addrspace(1)* %arg, align 4
425  %neg = fsub float -0.000000e+00, %load
426  %div = fdiv float 1.000000e+00, %neg
427  store float %div, float addrspace(1)* %arg, align 4
428  ret void
429}
430
; Test: -1.0 / (-x) (negation written as fsub -0.0, x) with no fast-math flags
; and no !fpmath metadata on the fdiv.
; Both runs expect the v_div_scale / v_rcp / v_div_fmas / v_div_fixup expansion
; rather than a bare reciprocal.
431; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
432; GCN-DAG: v_div_scale_f32
433; GCN-DAG: v_rcp_f32_e32
434; GCN-DAG: v_div_scale_f32
435; GCN:     v_div_fmas_f32
436; GCN:     v_div_fixup_f32
437define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
438  %load = load float, float addrspace(1)* %arg, align 4
439  %neg = fsub float -0.000000e+00, %load
440  %div = fdiv float -1.000000e+00, %neg
441  store float %div, float addrspace(1)* %arg, align 4
442  ret void
443}
444
; Metadata node referenced as !fpmath !0 by the tests in this file; it holds the
; single float 2.5 (the accuracy bound in ULPs, per the LLVM fpmath metadata
; definition).
445!0 = !{float 2.500000e+00}
446