; Codegen test for f32 fdiv-to-reciprocal lowering on gfx900. llc is invoked
; twice: once with -denormal-fp-math-f32=ieee (checked under the
; DENORM-suffixed prefix) and once with -denormal-fp-math-f32=preserve-sign
; (checked under the FLUSH-suffixed prefix). Checks using the unsuffixed
; prefix apply to both invocations.
1; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM %s
2; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH %s
3
4; GCN-LABEL: {{^}}div_1_by_x_25ulp:
5; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
6; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
7; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
8; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
9; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
10; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
11; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
12; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
13
14; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
15
16; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
; Scalar case: loads a float from %arg, divides 1.0 by it under the !fpmath !0
; accuracy bound, and stores the quotient back through %arg.
; The checks above expect, for the ieee-denormal invocation, a scale factor
; selected by comparing |x| against 0x6f800000 (0x2f800000 if greater, else
; 1.0), a multiply by that factor, v_rcp_f32, and a second multiply by the
; same factor. The preserve-sign invocation expects a lone v_rcp_f32.
17define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
18  %load = load float, float addrspace(1)* %arg, align 4
19  %div = fdiv float 1.000000e+00, %load, !fpmath !0
20  store float %div, float addrspace(1)* %arg, align 4
21  ret void
22}
23
24; GCN-LABEL: {{^}}div_minus_1_by_x_25ulp:
25; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
26; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
27; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
28; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
29; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
30; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], [[VAL]], -[[SCALE]]
31; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
32; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
33
34; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
35
36; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
; Scalar case with a negative numerator: -1.0 / x under the !fpmath !0 bound.
; The checks above expect the sign to show up as a negated source operand:
; on the scale factor of the first multiply in the ieee-denormal invocation,
; and on the v_rcp_f32 input in the preserve-sign invocation.
37define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
38  %load = load float, float addrspace(1)* %arg, align 4
39  %div = fdiv float -1.000000e+00, %load, !fpmath !0
40  store float %div, float addrspace(1)* %arg, align 4
41  ret void
42}
43
44; GCN-LABEL: {{^}}div_1_by_minus_x_25ulp:
45; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
46; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
47; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
48; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
49; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
50; GCN-DENORM:     v_mul_f32_e64 [[PRESCALED:v[0-9]+]], -[[VAL]], [[SCALE]]
51; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
52; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
53
54; GCN-FLUSH:      v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
55
56; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
; Scalar case with a negated divisor: 1.0 / (fneg x) under the !fpmath !0
; bound. The checks above expect the negation to be carried as a source
; modifier on the loaded value: in the first multiply for the ieee-denormal
; invocation, and on the v_rcp_f32 input for the preserve-sign invocation.
57define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
58  %load = load float, float addrspace(1)* %arg, align 4
59  %neg = fneg float %load
60  %div = fdiv float 1.000000e+00, %neg, !fpmath !0
61  store float %div, float addrspace(1)* %arg, align 4
62  ret void
63}
64
65; GCN-LABEL: {{^}}div_minus_1_by_minus_x_25ulp:
66; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
67; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
68; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
69; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
70; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
71; GCN-DENORM:     v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
72; GCN-DENORM:     v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
73; GCN-DENORM:     v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
74
75; GCN-FLUSH:      v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
76
77; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
; Scalar case with both operands negated: -1.0 / (-0.0 - x) under the
; !fpmath !0 bound. The checks above have the same shape as the plain
; 1.0 / x test: no negated source operands are expected in either invocation.
; NOTE(review): the negation is written as an fsub from -0.0 here, whereas
; @div_1_by_minus_x_25ulp uses fneg -- confirm the difference is deliberate.
78define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
79  %load = load float, float addrspace(1)* %arg, align 4
80  %neg = fsub float -0.000000e+00, %load
81  %div = fdiv float -1.000000e+00, %neg, !fpmath !0
82  store float %div, float addrspace(1)* %arg, align 4
83  ret void
84}
85
86; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp:
87; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
88; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
89; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
90; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
91; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
92; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
93; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
94; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
95; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
96; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
97; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
98; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
99; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
100; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
101; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
102; GCN-DENORM-DAG: v_rcp_f32_e32
103; GCN-DENORM-DAG: v_rcp_f32_e32
104; GCN-DENORM-DAG: v_rcp_f32_e32
105; GCN-DENORM-DAG: v_rcp_f32_e32
106; GCN-DENORM-DAG: v_mul_f32_e32
107; GCN-DENORM-DAG: v_mul_f32_e32
108; GCN-DENORM-DAG: v_mul_f32_e32
109; GCN-DENORM-DAG: v_mul_f32_e32
110
111; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
112; GCN-FLUSH:      v_rcp_f32_e32
113; GCN-FLUSH:      v_rcp_f32_e32
114; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
115; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
; <4 x float> variant of 1.0 / x under the !fpmath !0 bound: one vector load,
; a lane-wise fdiv with a splat-1.0 numerator, and a vector store back to %arg.
; The checks above expect four instances of the compare/select, multiply,
; v_rcp_f32, multiply pattern in the ieee-denormal invocation, and four
; v_rcp_f32 feeding a single global_store_dwordx4 in the preserve-sign one.
116define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
117  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
118  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
119  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
120  ret void
121}
122
123; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
124; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
125; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
126; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
127; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
128; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
129; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
130; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
131; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
132; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
133; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
134; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
135; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
136; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
137; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
138; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
139; GCN-DENORM-DAG: v_rcp_f32_e32
140; GCN-DENORM-DAG: v_rcp_f32_e32
141; GCN-DENORM-DAG: v_rcp_f32_e32
142; GCN-DENORM-DAG: v_rcp_f32_e32
143; GCN-DENORM-DAG: v_mul_f32_e32
144; GCN-DENORM-DAG: v_mul_f32_e32
145; GCN-DENORM-DAG: v_mul_f32_e32
146; GCN-DENORM-DAG: v_mul_f32_e32
147
148; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
149; GCN-FLUSH:      v_rcp_f32_e64
150; GCN-FLUSH:      v_rcp_f32_e64
151; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
; <4 x float> variant of -1.0 / x under the !fpmath !0 bound (splat -1.0
; numerator). The checks above expect the first multiply of each lane to
; negate the scale factor in the ieee-denormal invocation, and each
; v_rcp_f32 to take a negated scalar input in the preserve-sign invocation.
; NOTE(review): unlike the sibling uniform-numerator vector tests, no
; global_store_dwordx4 check follows the preserve-sign v_rcp_f32 checks --
; confirm whether that omission is intentional.
152define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
153  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
154  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0
155  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
156  ret void
157}
158
159; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
160; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
161; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
162; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
163; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
164; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
165; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
166; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
167; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
168; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
169; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
170; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
171; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
172; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
173; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
174; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
175; GCN-DENORM-DAG: v_rcp_f32_e32
176; GCN-DENORM-DAG: v_rcp_f32_e32
177; GCN-DENORM-DAG: v_rcp_f32_e32
178; GCN-DENORM-DAG: v_rcp_f32_e32
179; GCN-DENORM-DAG: v_mul_f32_e32
180; GCN-DENORM-DAG: v_mul_f32_e32
181; GCN-DENORM-DAG: v_mul_f32_e32
182; GCN-DENORM-DAG: v_mul_f32_e32
183
184; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
185; GCN-FLUSH:      v_rcp_f32_e64
186; GCN-FLUSH:      v_rcp_f32_e64
187; GCN-FLUSH:      v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
188; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
; <4 x float> variant of 1.0 / (fneg x) under the !fpmath !0 bound.
; The checks above expect the negation on the loaded scalar operand: in the
; first multiply of each lane for the ieee-denormal invocation, and on each
; v_rcp_f32 input for the preserve-sign invocation, whose four results feed
; a single global_store_dwordx4.
189define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
190  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
191  %neg = fneg <4 x float> %load
192  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0
193  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
194  ret void
195}
196
197; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
198; GCN-DAG:        s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
199; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
200; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
201; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
202; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
203; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
204; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
205; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
206; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
207; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
208; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
209; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
210; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
211; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
212; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
213; GCN-DENORM-DAG: v_rcp_f32_e32
214; GCN-DENORM-DAG: v_rcp_f32_e32
215; GCN-DENORM-DAG: v_rcp_f32_e32
216; GCN-DENORM-DAG: v_rcp_f32_e32
217; GCN-DENORM-DAG: v_mul_f32_e32
218; GCN-DENORM-DAG: v_mul_f32_e32
219; GCN-DENORM-DAG: v_mul_f32_e32
220; GCN-DENORM-DAG: v_mul_f32_e32
221
222; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
223; GCN-FLUSH:      v_rcp_f32_e32
224; GCN-FLUSH:      v_rcp_f32_e32
225; GCN-FLUSH:      v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
226; GCN-FLUSH:      global_store_dwordx4 v{{[0-9]+}}, v{{\[}}[[OUT0]]:[[OUT3]]], s{{\[[0-9]+:[0-9]+\]}}
; <4 x float> variant of -1.0 / (fneg x) under the !fpmath !0 bound.
; The checks above match the shape of the plain vector 1.0 / x test: no
; negated operands are expected in either invocation, and the preserve-sign
; invocation expects four v_rcp_f32 feeding a single global_store_dwordx4.
227define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
228  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
229  %neg = fneg <4 x float> %load
230  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0
231  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
232  ret void
233}
234
235; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
236; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
237; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
238; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
239; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
240; GCN-DENORM-DAG: v_rcp_f32_e32
241; GCN-DENORM-DAG: v_rcp_f32_e32
242
243; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
244; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
245
246; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
247; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
248; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
249; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
250
251; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
252; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
253; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
254; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
255; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
256; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
257
258; GCN-DENORM-DAG: v_div_fmas_f32
259; GCN-DENORM-DAG: v_div_fmas_f32
260; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
261; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
262
263; GCN-FLUSH-DAG:  v_rcp_f32_e32
264; GCN-FLUSH-DAG:  v_rcp_f32_e64
265
266; GCN-NOT:        v_cmp_gt_f32_e64
267; GCN-NOT:        v_cndmask_b32_e32
268; GCN-FLUSH-NOT:  v_div
269
270; GCN:            global_store_dwordx4
; Mixed-numerator vector case: <2.0, 1.0, -1.0, -2.0> / x under the
; !fpmath !0 bound. Per the checks above, the ieee-denormal invocation is
; expected to use v_div_scale/v_div_fmas/v_div_fixup with 2.0 and -2.0
; operands alongside two compare/select-scaled v_rcp_f32 sequences. The
; preserve-sign invocation must contain no v_div* instruction, but is still
; expected to contain two compare/select scale pairs.
271define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
272  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
273  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0
274  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
275  ret void
276}
277
278; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
279; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
280; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
281; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
282; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
283; GCN-DENORM-DAG: v_rcp_f32_e32
284; GCN-DENORM-DAG: v_rcp_f32_e32
285
286; GCN-DAG:        v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
287; GCN-DAG:        v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
288
289; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
290; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
291; GCN-DAG:        v_cmp_gt_f32_e64 vcc, |s{{[0-9]+}}|, [[L]]
292; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
293
294; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
295; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
296; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
297; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
298; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
299; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
300
301; GCN-DENORM-DAG: v_div_fmas_f32
302; GCN-DENORM-DAG: v_div_fmas_f32
303; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
304; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
305
306; GCN-FLUSH-DAG:  v_rcp_f32_e32
307; GCN-FLUSH-DAG:  v_rcp_f32_e64
308
309; GCN-NOT:        v_cmp_gt_f32_e64
310; GCN-NOT:        v_cndmask_b32_e32
311; GCN-FLUSH-NOT:  v_div
312
313; GCN:            global_store_dwordx4
; Mixed-numerator vector case with a negated divisor:
; <2.0, 1.0, -1.0, -2.0> / (fneg x) under the !fpmath !0 bound.
; Per the checks above, every v_div_scale/v_div_fixup check in the
; ieee-denormal invocation names -2.0 as its last operand, and one of the
; scaled-reciprocal multiplies negates the loaded scalar. The preserve-sign
; invocation must contain no v_div* instruction.
314define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
315  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
316  %neg = fneg <4 x float> %load
317  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0
318  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
319  ret void
320}
321
322; GCN-LABEL: {{^}}div_v_by_x_25ulp:
323; GCN-DAG:        s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
324
325; GCN-DENORM-DAG: v_div_scale_f32
326; GCN-DENORM-DAG: v_rcp_f32_e32
327; GCN-DENORM-DAG: v_div_scale_f32
328; GCN-DENORM:     v_div_fmas_f32
329; GCN-DENORM:     v_div_fixup_f32 [[OUT:v[0-9]+]],
330
331; GCN-FLUSH-DAG:  v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
332; GCN-FLUSH-DAG:  v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
333; GCN-FLUSH-DAG:  v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
334; GCN-FLUSH-DAG:  v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
335; GCN-FLUSH:      v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
336; GCN-FLUSH:      v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
337; GCN-FLUSH:      v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
338
339; GCN:            global_store_dword v{{[0-9]+}}, [[OUT]], s{{\[[0-9]+:[0-9]+\]}}
; Non-constant numerator: %num / x under the !fpmath !0 bound, where %num is
; a kernel argument. Per the checks above, the ieee-denormal invocation
; expects the v_div_scale/v_div_fmas/v_div_fixup expansion, whereas the
; preserve-sign invocation expects the compare/select-scaled v_rcp_f32
; sequence.
340define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) {
341  %load = load float, float addrspace(1)* %arg, align 4
342  %div = fdiv float %num, %load, !fpmath !0
343  store float %div, float addrspace(1)* %arg, align 4
344  ret void
345}
346
; Fast-math scalar reciprocal: 1.0 / x with the `fast` flag on the fdiv.
; Only the shared check prefix is used, so both invocations expect a single
; v_rcp_f32 on the loaded value. Register ranges are matched by pattern
; rather than by fixed register numbers, as in the *_25ulp tests.
347; GCN-LABEL: {{^}}div_1_by_x_fast:
348; GCN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0
349; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
350; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
351define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
352  %load = load float, float addrspace(1)* %arg, align 4
353  %div = fdiv fast float 1.000000e+00, %load, !fpmath !0
354  store float %div, float addrspace(1)* %arg, align 4
355  ret void
356}
357
; Fast-math scalar case with a negative numerator: -1.0 / x with the `fast`
; flag. Only the shared check prefix is used, so both invocations expect a
; single v_rcp_f32 taking the negated loaded value. The load's base register
; pair is matched by pattern rather than by fixed register numbers.
358; GCN-LABEL: {{^}}div_minus_1_by_x_fast:
359; GCN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0
360; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
361; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
362define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
363  %load = load float, float addrspace(1)* %arg, align 4
364  %div = fdiv fast float -1.000000e+00, %load, !fpmath !0
365  store float %div, float addrspace(1)* %arg, align 4
366  ret void
367}
368
; Fast-math scalar case with a negated divisor: 1.0 / (fneg x) with the
; `fast` flag on the fdiv. Only the shared check prefix is used, so both
; invocations expect a single v_rcp_f32 taking the negated loaded value.
; The load's base register pair is matched by pattern rather than by fixed
; register numbers.
; NOTE(review): !fpmath !0 is attached to the fneg, not to the fdiv, in this
; test -- confirm that placement is intended.
369; GCN-LABEL: {{^}}div_1_by_minus_x_fast:
370; GCN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0
371; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
372; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
373define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
374  %load = load float, float addrspace(1)* %arg, align 4
375  %neg = fneg float %load, !fpmath !0
376  %div = fdiv fast float 1.000000e+00, %neg
377  store float %div, float addrspace(1)* %arg, align 4
378  ret void
379}
380
; Fast-math scalar case with both operands negated: -1.0 / (-0.0 - x) with
; the `fast` flag on the fdiv. Only the shared check prefix is used, so both
; invocations expect a single v_rcp_f32 on the un-negated loaded value.
; The load's base register pair is matched by pattern rather than by fixed
; register numbers.
; NOTE(review): !fpmath !0 is attached to the fsub, not to the fdiv, in this
; test -- confirm that placement is intended.
381; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast:
382; GCN: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0
383; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
384; GCN: global_store_dword v{{[0-9]+}}, [[RCP]], s{{\[[0-9]+:[0-9]+\]}}
385define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
386  %load = load float, float addrspace(1)* %arg, align 4
387  %neg = fsub float -0.000000e+00, %load, !fpmath !0
388  %div = fdiv fast float -1.000000e+00, %neg
389  store float %div, float addrspace(1)* %arg, align 4
390  ret void
391}
392
393; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
394; GCN-DAG: v_div_scale_f32
395; GCN-DAG: v_rcp_f32_e32
396; GCN-DAG: v_div_scale_f32
397; GCN:     v_div_fmas_f32
398; GCN:     v_div_fixup_f32
; 1.0 / x with neither !fpmath metadata nor fast-math flags on the fdiv.
; The checks above use only the shared prefix, so both invocations expect
; the v_div_scale / v_rcp_f32 / v_div_fmas / v_div_fixup expansion.
399define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
400  %load = load float, float addrspace(1)* %arg, align 4
401  %div = fdiv float 1.000000e+00, %load
402  store float %div, float addrspace(1)* %arg, align 4
403  ret void
404}
405
406; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
407; GCN-DAG: v_div_scale_f32
408; GCN-DAG: v_rcp_f32_e32
409; GCN-DAG: v_div_scale_f32
410; GCN:     v_div_fmas_f32
411; GCN:     v_div_fixup_f32
; -1.0 / x with neither !fpmath metadata nor fast-math flags on the fdiv.
; The checks above use only the shared prefix, so both invocations expect
; the v_div_scale / v_rcp_f32 / v_div_fmas / v_div_fixup expansion.
412define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
413  %load = load float, float addrspace(1)* %arg, align 4
414  %div = fdiv float -1.000000e+00, %load
415  store float %div, float addrspace(1)* %arg, align 4
416  ret void
417}
418
419; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
420; GCN-DAG: v_div_scale_f32
421; GCN-DAG: v_rcp_f32_e32
422; GCN-DAG: v_div_scale_f32
423; GCN:     v_div_fmas_f32
424; GCN:     v_div_fixup_f32
; 1.0 / (-0.0 - x) with neither !fpmath metadata nor fast-math flags.
; The checks above use only the shared prefix, so both invocations expect
; the v_div_scale / v_rcp_f32 / v_div_fmas / v_div_fixup expansion.
425define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
426  %load = load float, float addrspace(1)* %arg, align 4
427  %neg = fsub float -0.000000e+00, %load
428  %div = fdiv float 1.000000e+00, %neg
429  store float %div, float addrspace(1)* %arg, align 4
430  ret void
431}
432
433; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
434; GCN-DAG: v_div_scale_f32
435; GCN-DAG: v_rcp_f32_e32
436; GCN-DAG: v_div_scale_f32
437; GCN:     v_div_fmas_f32
438; GCN:     v_div_fixup_f32
; -1.0 / (-0.0 - x) with neither !fpmath metadata nor fast-math flags.
; The checks above use only the shared prefix, so both invocations expect
; the v_div_scale / v_rcp_f32 / v_div_fmas / v_div_fixup expansion.
439define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
440  %load = load float, float addrspace(1)* %arg, align 4
441  %neg = fsub float -0.000000e+00, %load
442  %div = fdiv float -1.000000e+00, %neg
443  store float %div, float addrspace(1)* %arg, align 4
444  ret void
445}
446
; Metadata node referenced as `!fpmath !0` by the tests above; its single
; operand is the float constant 2.5 (the "25ulp" in the test names).
447!0 = !{float 2.500000e+00}
448