; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-FLUSH %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,GCN-FLUSH %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s
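
; Tests elimination of @llvm.canonicalize.* calls whose input is already
; canonical. Where the canonicalize must be kept, it is lowered to
; v_mul_f32 1.0, x when f32 denormals are flushed, or v_max_f32 x, x when
; they are enabled.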

; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
; GCN-FLUSH:   v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX9-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %v = load float, float addrspace(1)* %gep, align 4
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

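; The result of a VALU floating-point instruction is already canonical, so
; the canonicalize folds away in the arithmetic tests that follow.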
; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fmul float %load, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fsub float 15.0, %load
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fadd float %load, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.sqrt.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.ceil.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.floor.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX9-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
; GCN-FLUSH:  v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.canonicalize.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fpext float %load to double
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
  store double %canonicalized, double addrspace(1)* %gep2, align 8
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = fpext half %load to float
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %load = load double, double addrspace(1)* %gep, align 8
  %v = fptrunc double %load to float
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fptrunc float %load to half
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
  store half %canonicalized, half addrspace(1)* %gep2, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %v = fptrunc <2 x float> %load to <2 x half>
  %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
  %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
  ret void
}

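; fneg and fabs of an unknown value do not canonicalize; in the no_fold tests
; below they fold into the remaining canonicalize as source modifiers.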
; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
; GCN-FLUSH:  v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}}
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fsub float -0.0, %load
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = fsub float -0.0, %v0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
; GCN-FLUSH:  v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.fabs.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.fabs.f32(float %v0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.sin.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.cos.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = tail call half @llvm.sin.f16(half %load)
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  store half %canonicalized, half addrspace(1)* %gep, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = tail call half @llvm.cos.f16(half %load)
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  store half %canonicalized, half addrspace(1)* %gep, align 2
  ret void
}

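; Canonicalizing a compile-time quiet NaN folds to the canonical qNaN bit
; pattern 0x7fc00000.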
; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32:
; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX9: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; FIXME: Should there be more checks here? minnum with a NaN operand is
; simplified away.

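; 2139095041 is 0x7f800001, a signaling NaN.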
; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
; VI:   v_add_u32_e32 v{{[0-9]+}}
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}
; GCN:  {{flat|global}}_store_dword v[{{[0-9:]+}}]
define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

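; 8388607 is 0x007fffff, the largest f32 denormal value.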
; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
; GFX9:  v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; VI:    v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; VI:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
; GCN:   {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
; GFX9-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32:
; GFX9:  v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
; VI:    v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
; VI:    v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
; GCN:  {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
; GFX9-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %load = load double, double addrspace(1)* %gep, align 8
  %v0 = fadd double %load, 0.0
  %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  store double %canonicalized, double addrspace(1)* %gep, align 8
  ret void
}

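; The _no_ieee tests use the amdgpu_ps calling convention, which runs with
; the IEEE mode bit clear.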
; GCN-LABEL: test_no_fold_canonicalize_fmul_value_f32_no_ieee:
; GCN-EXCEPT: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_ps float @test_no_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
entry:
  %v = fmul float %arg, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NEXT: ; return
; GCN-NOT: 1.0
define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
entry:
  %v = fmul nnan float %arg, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

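; The remaining kernels use attribute #1 ("no-nans-fp-math"="true"), so the
; canonicalize of a loaded value is dropped whenever denormals of that type
; are not flushed.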
; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32
; GFX9-DENORM: global_load_dword [[V:v[0-9]+]],
; GFX9-DENORM: global_store_dword v[{{[0-9:]+}}], [[V]]
; GFX9-DENORM-NOT: 1.0
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %v = load float, float addrspace(1)* %gep, align 4
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]],
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %v = load double, double addrspace(1)* %gep, align 8
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
  store double %canonicalized, double addrspace(1)* %gep2, align 8
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %v = load half, half addrspace(1)* %gep, align 2
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
  store half %canonicalized, half addrspace(1)* %gep2, align 2
  ret void
}

; Avoid failing the test on FreeBSD 11.0, where the GCN-NOT: 1.0 above would
; otherwise match the "11.0" in the
; .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive.
; GCN: .amd_amdgpu_isa

declare float @llvm.canonicalize.f32(float) #0
declare double @llvm.canonicalize.f64(double) #0
declare half @llvm.canonicalize.f16(half) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.sqrt.f32(float) #0
declare float @llvm.ceil.f32(float) #0
declare float @llvm.floor.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.sin.f32(float) #0
declare float @llvm.cos.f32(float) #0
declare half @llvm.sin.f16(half) #0
declare half @llvm.cos.f16(half) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.maxnum.f64(double, double) #0

attributes #0 = { nounwind readnone }
attributes #1 = { "no-nans-fp-math"="true" }