1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
2; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
3; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
4
5
; Clamp of a register value between the immediates 2.0 and 4.0, where the
; clamped value is produced by an `fadd nnan`. The checks expect the
; maxnum/minnum pair to select to one v_med3_f32 that consumes the v_add_f32
; result directly.
6; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
7; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
8; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
9define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  ; Index both the input and the output pointer by the work-item id.
10  %tid = call i32 @llvm.amdgcn.workitem.id.x()
11  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
12  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
13  %a = load float, float addrspace(1)* %gep0
  ; The nnan flag on this fadd is the only no-NaN information on the input.
14  %a.add = fadd nnan float %a, 1.0
  ; max with the low bound, then min with the high bound.
15  %max = call float @llvm.maxnum.f32(float %a.add, float 2.0)
16  %med = call float @llvm.minnum.f32(float %max, float 4.0)
17
18  store float %med, float addrspace(1)* %outgep
19  ret void
20}
21
; Register/immediate/immediate clamp: maxnum(x, 2.0) followed by
; minnum(.., 4.0), with x coming from an `fadd nnan`. Only the resulting
; v_med3_f32 with the 2.0 and 4.0 inline constants is checked here.
22; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_f32:
23; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
24define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
25  %tid = call i32 @llvm.amdgcn.workitem.id.x()
26  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
27  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
28  %a = load float, float addrspace(1)* %gep0
29  %a.add = fadd nnan float %a, 1.0
30
  ; Constants are the second operand of both intrinsics in this variant.
31  %max = call float @llvm.maxnum.f32(float %a.add, float 2.0)
32  %med = call float @llvm.minnum.f32(float %max, float 4.0)
33
34  store float %med, float addrspace(1)* %outgep
35  ret void
36}
37
; Same clamp as v_test_fmed3_nnan_r_i_i_f32, but with the constant as the
; first operand of both maxnum and minnum. The checks expect the same
; v_med3_f32 operand order (register, 2.0, 4.0).
38; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute0_f32:
39; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
40define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
41  %tid = call i32 @llvm.amdgcn.workitem.id.x()
42  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
43  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
44  %a = load float, float addrspace(1)* %gep0
45  %a.add = fadd nnan float %a, 1.0
46
  ; Both calls are commuted relative to the base test.
47  %max = call float @llvm.maxnum.f32(float 2.0, float %a.add)
48  %med = call float @llvm.minnum.f32(float 4.0, float %max)
49
50  store float %med, float addrspace(1)* %outgep
51  ret void
52}
53
; Same clamp as v_test_fmed3_nnan_r_i_i_f32, but only the outer minnum has its
; constant as the first operand. The checks still expect v_med3_f32 with
; (register, 2.0, 4.0).
54; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_commute1_f32:
55; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
56define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
57  %tid = call i32 @llvm.amdgcn.workitem.id.x()
58  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
59  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
60  %a = load float, float addrspace(1)* %gep0
61  %a.add = fadd nnan float %a, 1.0
62
  ; maxnum keeps the constant second; only minnum is commuted.
63  %max = call float @llvm.maxnum.f32(float %a.add, float 2.0)
64  %med = call float @llvm.minnum.f32(float 4.0, float %max)
65
66  store float %med, float addrspace(1)* %outgep
67  ret void
68}
69
; Negative test: the constants are in the opposite order (max with 4.0, then
; min with 2.0), so the max bound is not below the min bound. The checks expect
; separate v_max_f32 and v_min_f32 instructions rather than v_med3_f32.
70; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_constant_order_f32:
71; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
72; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
73define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
74  %tid = call i32 @llvm.amdgcn.workitem.id.x()
75  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
76  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
77  %a = load float, float addrspace(1)* %gep0
78  %a.add = fadd nnan float %a, 1.0
79
  ; Larger constant on the max, smaller on the min.
80  %max = call float @llvm.maxnum.f32(float %a.add, float 4.0)
81  %med = call float @llvm.minnum.f32(float %max, float 2.0)
82
83  store float %med, float addrspace(1)* %outgep
84  ret void
85}
86
; Negative test: the intermediate maxnum result has a second use (it is stored
; as well as fed to minnum). The checks expect v_max_f32 and v_min_f32 to stay
; separate instead of forming v_med3_f32.
87; GCN-LABEL: {{^}}v_test_fmed3_nnan_r_i_i_multi_use_f32:
88; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
89; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
90define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
91  %tid = call i32 @llvm.amdgcn.workitem.id.x()
92  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
93  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
94  %a = load float, float addrspace(1)* %gep0
95  %a.add = fadd nnan float %a, 1.0
96
97  %max = call float @llvm.maxnum.f32(float %a.add, float 2.0)
98  %med = call float @llvm.minnum.f32(float %max, float 4.0)
99
  ; Both values go to the same address; the stores are volatile, so neither
  ; store (and therefore neither value) can be dropped.
100  store volatile float %med, float addrspace(1)* %outgep
101  store volatile float %max, float addrspace(1)* %outgep
102  ret void
103}
104
; Double-precision version of the register/immediate/immediate clamp. The
; checks expect a v_max_f64 followed by a v_min_f64; no med3 instruction is
; expected for f64.
105; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
106; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
107; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
108define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
109  %tid = call i32 @llvm.amdgcn.workitem.id.x()
110  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
111  %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid
112  %a = load double, double addrspace(1)* %gep0
113  %a.add = fadd nnan double %a, 1.0
114
  ; Same max-then-min shape as the f32 tests, on double operands.
115  %max = call double @llvm.maxnum.f64(double %a.add, double 2.0)
116  %med = call double @llvm.minnum.f64(double %max, double 4.0)
117
118  store double %med, double addrspace(1)* %outgep
119  ret void
120}
121
; Clamp applied directly to the loaded value, with no `fadd nnan` in front.
; The function uses attribute group #2 instead of #1; that group is defined
; outside this part of the file (presumably it enables no-NaNs FP math --
; confirm against the attribute list). The checks expect v_med3_f32.
122; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
123; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
124define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
125  %tid = call i32 @llvm.amdgcn.workitem.id.x()
126  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
127  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
128  %a = load float, float addrspace(1)* %gep0
129
  ; The loaded value is clamped as-is; no instruction here carries an nnan flag.
130  %max = call float @llvm.maxnum.f32(float %a, float 2.0)
131  %med = call float @llvm.minnum.f32(float %max, float 4.0)
132
133  store float %med, float addrspace(1)* %outgep
134  ret void
135}
136
; The clamp is written with fcmp + select instead of the maxnum/minnum
; intrinsics. The compares use unordered predicates (ule / uge) on a value
; produced by `fadd nnan`. The checks expect v_med3_f32 with 2.0 and 4.0.
137; GCN-LABEL: {{^}}v_test_legacy_fmed3_r_i_i_f32:
138; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
139define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
140  %tid = call i32 @llvm.amdgcn.workitem.id.x()
141  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
142  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
143  %a = load float, float addrspace(1)* %gep0
144  %a.nnan = fadd nnan float %a, 1.0
145
146  ; fmax_legacy
  ; Selects 2.0 when %a.nnan <= 2.0 (or unordered), otherwise %a.nnan.
147  %cmp0 = fcmp ule float %a.nnan, 2.0
148  %max = select i1 %cmp0, float 2.0, float %a.nnan
149
150  ; fmin_legacy
  ; Selects 4.0 when %max >= 4.0 (or unordered), otherwise %max.
151  %cmp1 = fcmp uge float %max, 4.0
152  %med = select i1 %cmp1, float 4.0, float %max
153
154  store float %med, float addrspace(1)* %outgep
155  ret void
156}
157
; Three-register median pattern max(min(x, y), min(max(x, y), z)) where the
; first input is negated. The checks expect the negation to be folded into
; v_med3_f32 as a source modifier on the first operand.
; The load checks use the same {{buffer|flat|global}}_load_dword spelling as
; the other tests in this file.
158; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0:
159; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
160; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
161; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
162; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]]
163define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
164  %tid = call i32 @llvm.amdgcn.workitem.id.x()
165  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
166  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
167  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
168  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  ; Volatile loads -- presumably so the three loads stay in A, B, C order for
  ; the register captures above.
169  %a = load volatile float, float addrspace(1)* %gep0
170  %b = load volatile float, float addrspace(1)* %gep1
171  %c = load volatile float, float addrspace(1)* %gep2
  ; Negation written as -0.0 - x.
172  %a.fneg = fsub float -0.0, %a
173  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
174  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
175  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
176  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
177  store float %med3, float addrspace(1)* %outgep
178  ret void
179}
180
; Three-register median pattern where the second input is negated. The checks
; expect the negation as a source modifier on the second v_med3_f32 operand.
181; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod1:
182; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
183; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
184; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
185; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]]
186define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
187  %tid = call i32 @llvm.amdgcn.workitem.id.x()
188  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
189  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
190  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
191  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
192  %a = load volatile float, float addrspace(1)* %gep0
193  %b = load volatile float, float addrspace(1)* %gep1
194  %c = load volatile float, float addrspace(1)* %gep2
  ; Negation written as -0.0 - x; %b.fneg feeds both the inner min and max.
195  %b.fneg = fsub float -0.0, %b
196  %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg)
197  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b.fneg)
198  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
199  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
200  store float %med3, float addrspace(1)* %outgep
201  ret void
202}
203
; Three-register median pattern where the third input is negated. The checks
; expect the negation as a source modifier on the third v_med3_f32 operand.
204; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod2:
205; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
206; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
207; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
208; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]]
209define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
210  %tid = call i32 @llvm.amdgcn.workitem.id.x()
211  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
212  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
213  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
214  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
215  %a = load volatile float, float addrspace(1)* %gep0
216  %b = load volatile float, float addrspace(1)* %gep1
217  %c = load volatile float, float addrspace(1)* %gep2
  ; Negation written as -0.0 - x; %c.fneg is used only by the second minnum.
218  %c.fneg = fsub float -0.0, %c
219  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
220  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
221  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg)
222  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
223  store float %med3, float addrspace(1)* %outgep
224  ret void
225}
226
; Three-register median pattern with a different modifier on each input:
; %a is negated, %b goes through fabs, %c goes through fabs and is then
; negated. The checks expect all three folded into v_med3_f32 as
; -A, |B|, -|C|.
227; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod012:
228; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
229; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
230; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
231; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]|
232define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
233  %tid = call i32 @llvm.amdgcn.workitem.id.x()
234  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
235  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
236  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
237  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
238  %a = load volatile float, float addrspace(1)* %gep0
239  %b = load volatile float, float addrspace(1)* %gep1
240  %c = load volatile float, float addrspace(1)* %gep2
241
  ; Build the modified inputs: neg, abs, and neg-of-abs.
242  %a.fneg = fsub float -0.0, %a
243  %b.fabs = call float @llvm.fabs.f32(float %b)
244  %c.fabs = call float @llvm.fabs.f32(float %c)
245  %c.fabs.fneg = fsub float -0.0, %c.fabs
246
247  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
248  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
249  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
250  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
251
252  store float %med3, float addrspace(1)* %outgep
253  ret void
254}
255
; Three-register median pattern where every input is fabs followed by a
; negation. The checks expect -|x| modifiers on all three v_med3_f32 operands.
256; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_negabs012:
257; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
258; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
259; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
260; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]|
261define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
262  %tid = call i32 @llvm.amdgcn.workitem.id.x()
263  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
264  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
265  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
266  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
267  %a = load volatile float, float addrspace(1)* %gep0
268  %b = load volatile float, float addrspace(1)* %gep1
269  %c = load volatile float, float addrspace(1)* %gep2
270
  ; Each input: fabs, then negate via -0.0 - x.
271  %a.fabs = call float @llvm.fabs.f32(float %a)
272  %a.fabs.fneg = fsub float -0.0, %a.fabs
273  %b.fabs = call float @llvm.fabs.f32(float %b)
274  %b.fabs.fneg = fsub float -0.0, %b.fabs
275  %c.fabs = call float @llvm.fabs.f32(float %c)
276  %c.fabs.fneg = fsub float -0.0, %c.fabs
277
278  %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
279  %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
280  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
281  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
282
283  store float %med3, float addrspace(1)* %outgep
284  ret void
285}
286
; Three-register median pattern in a function using attribute group #1, where
; each input is produced by its own `fadd nnan` (adding 1.0, 2.0 and 4.0).
; The checks expect the three v_add_f32 results, in any order, to feed a
; single v_med3_f32.
287; GCN-LABEL: {{^}}v_nnan_inputs_med3_f32_pat0:
288; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
289; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
290; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
291; GCN-DAG: v_add_f32_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
292; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
293; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
294; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
295define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
296  %tid = call i32 @llvm.amdgcn.workitem.id.x()
297  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
298  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
299  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
300  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
301  %a = load volatile float, float addrspace(1)* %gep0
302  %b = load volatile float, float addrspace(1)* %gep1
303  %c = load volatile float, float addrspace(1)* %gep2
304
  ; Distinct addends let the checks tell the three adds apart.
305  %a.nnan = fadd nnan float %a, 1.0
306  %b.nnan = fadd nnan float %b, 2.0
307  %c.nnan = fadd nnan float %c, 4.0
308
309  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
310  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
311  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
312  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
313  store float %med3, float addrspace(1)* %outgep
314  ret void
315}
316
317; 16 combinations
318
319; 0: max(min(x, y), min(max(x, y), z))
320; 1: max(min(x, y), min(max(y, x), z))
321; 2: max(min(x, y), min(z, max(x, y)))
322; 3: max(min(x, y), min(z, max(y, x)))
323; 4: max(min(y, x), min(max(x, y), z))
324; 5: max(min(y, x), min(max(y, x), z))
325; 6: max(min(y, x), min(z, max(x, y)))
326; 7: max(min(y, x), min(z, max(y, x)))
327;
328; + commute outermost max
329
; Combination 0 from the table above: max(min(x, y), min(max(x, y), z)) with
; x = %a, y = %b, z = %c. The checks expect v_med3_f32 with operands A, B, C.
330; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0:
331; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
332; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
333; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
334; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
335define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
336  %tid = call i32 @llvm.amdgcn.workitem.id.x()
337  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
338  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
339  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
340  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
341  %a = load volatile float, float addrspace(1)* %gep0
342  %b = load volatile float, float addrspace(1)* %gep1
343  %c = load volatile float, float addrspace(1)* %gep2
  ; No operand is commuted in this baseline form.
344  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
345  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
346  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
347  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
348  store float %med3, float addrspace(1)* %outgep
349  ret void
350}
351
; Combination 1 from the table above: max(min(x, y), min(max(y, x), z)) with
; x = %a, y = %b, z = %c (inner max commuted). The checks expect v_med3_f32
; with operands A, B, C.
352; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat1:
353; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
354; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
355; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
356; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
357define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
358  %tid = call i32 @llvm.amdgcn.workitem.id.x()
359  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
360  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
361  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
362  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
363  %a = load volatile float, float addrspace(1)* %gep0
364  %b = load volatile float, float addrspace(1)* %gep1
365  %c = load volatile float, float addrspace(1)* %gep2
366  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  ; Commuted relative to combination 0.
367  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
368  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
369  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
370  store float %med3, float addrspace(1)* %outgep
371  ret void
372}
373
; Combination 2 from the table above: max(min(x, y), min(z, max(x, y))) with
; x = %a, y = %b, z = %c (second min commuted). The checks expect v_med3_f32
; with operands A, B, C.
374; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat2:
375; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
376; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
377; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
378; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
379define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
380  %tid = call i32 @llvm.amdgcn.workitem.id.x()
381  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
382  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
383  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
384  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
385  %a = load volatile float, float addrspace(1)* %gep0
386  %b = load volatile float, float addrspace(1)* %gep1
387  %c = load volatile float, float addrspace(1)* %gep2
388  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
389  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
  ; Commuted relative to combination 0.
390  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
391  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
392  store float %med3, float addrspace(1)* %outgep
393  ret void
394}
395
; Combination 3 from the table above: max(min(x, y), min(z, max(y, x))) with
; x = %a, y = %b, z = %c (inner max and second min commuted). The checks
; expect v_med3_f32 with operands A, B, C.
396; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat3:
397; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
398; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
399; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
400; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
401define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
402  %tid = call i32 @llvm.amdgcn.workitem.id.x()
403  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
404  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
405  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
406  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
407  %a = load volatile float, float addrspace(1)* %gep0
408  %b = load volatile float, float addrspace(1)* %gep1
409  %c = load volatile float, float addrspace(1)* %gep2
410  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  ; Both of the next two calls are commuted relative to combination 0.
411  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
412  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
413  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
414  store float %med3, float addrspace(1)* %outgep
415  ret void
416}
417
; Combination 4 from the table above: max(min(y, x), min(max(x, y), z)) with
; x = %a, y = %b, z = %c (only the first min commuted). The checks expect
; v_med3_f32 with operands B, A, C, matching the other tests whose first min
; is min(%b, %a).
418; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat4:
419; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
420; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
421; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
422; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
423define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
424  %tid = call i32 @llvm.amdgcn.workitem.id.x()
425  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
426  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
427  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
428  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
429  %a = load volatile float, float addrspace(1)* %gep0
430  %b = load volatile float, float addrspace(1)* %gep1
431  %c = load volatile float, float addrspace(1)* %gep2
  ; Commuted relative to combination 0.
432  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
  ; The inner max and the second min keep their uncommuted operand order, so
  ; this test differs from combinations 5, 6 and 7.
433  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
434  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
435  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
436  store float %med3, float addrspace(1)* %outgep
437  ret void
438}
439
; Combination 5 from the table above: max(min(y, x), min(max(y, x), z)) with
; x = %a, y = %b, z = %c (first min and inner max commuted). The checks expect
; v_med3_f32 with operands B, A, C.
440; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat5:
441; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
442; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
443; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
444; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
445define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
446  %tid = call i32 @llvm.amdgcn.workitem.id.x()
447  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
448  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
449  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
450  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
451  %a = load volatile float, float addrspace(1)* %gep0
452  %b = load volatile float, float addrspace(1)* %gep1
453  %c = load volatile float, float addrspace(1)* %gep2
  ; Both of the next two calls are commuted relative to combination 0.
454  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
455  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
456  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
457  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
458  store float %med3, float addrspace(1)* %outgep
459  ret void
460}
461
; Combination 6 from the table above: max(min(y, x), min(z, max(x, y))) with
; x = %a, y = %b, z = %c (first min and second min commuted). The checks
; expect v_med3_f32 with operands B, A, C.
462; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat6:
463; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
464; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
465; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
466; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
467define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
468  %tid = call i32 @llvm.amdgcn.workitem.id.x()
469  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
470  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
471  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
472  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
473  %a = load volatile float, float addrspace(1)* %gep0
474  %b = load volatile float, float addrspace(1)* %gep1
475  %c = load volatile float, float addrspace(1)* %gep2
  ; First min commuted, inner max not commuted, second min commuted.
476  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
477  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
478  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
479  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
480  store float %med3, float addrspace(1)* %outgep
481  ret void
482}
483
; Combination 7 from the table above: max(min(y, x), min(z, max(y, x))) with
; x = %a, y = %b, z = %c (all three inner calls commuted). The checks expect
; v_med3_f32 with operands B, A, C.
484; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat7:
485; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
486; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
487; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
488; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
489define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
490  %tid = call i32 @llvm.amdgcn.workitem.id.x()
491  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
492  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
493  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
494  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
495  %a = load volatile float, float addrspace(1)* %gep0
496  %b = load volatile float, float addrspace(1)* %gep1
497  %c = load volatile float, float addrspace(1)* %gep2
  ; All three of the following calls are commuted relative to combination 0.
498  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
499  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
500  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
501  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
502  store float %med3, float addrspace(1)* %outgep
503  ret void
504}
505
; Combination 0 from the table above with the operands of the outermost max
; swapped: max(min(max(x, y), z), min(x, y)) with x = %a, y = %b, z = %c.
; The checks expect v_med3_f32 with operands A, B, C.
506; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat8:
507; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
508; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
509; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
510; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
511define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
512  %tid = call i32 @llvm.amdgcn.workitem.id.x()
513  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
514  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
515  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
516  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
517  %a = load volatile float, float addrspace(1)* %gep0
518  %b = load volatile float, float addrspace(1)* %gep1
519  %c = load volatile float, float addrspace(1)* %gep2
520  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
521  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
522  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  ; Outermost max takes %tmp2 first here.
523  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
524  store float %med3, float addrspace(1)* %outgep
525  ret void
526}
527
; Combination 1 from the table above with the operands of the outermost max
; swapped: max(min(max(y, x), z), min(x, y)) with x = %a, y = %b, z = %c.
; The checks expect v_med3_f32 with operands B, A, C.
528; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat9:
529; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
530; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
531; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
532; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
533define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
534  %tid = call i32 @llvm.amdgcn.workitem.id.x()
535  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
536  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
537  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
538  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
539  %a = load volatile float, float addrspace(1)* %gep0
540  %b = load volatile float, float addrspace(1)* %gep1
541  %c = load volatile float, float addrspace(1)* %gep2
542  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  ; Inner max is commuted.
543  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
544  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  ; Outermost max takes %tmp2 first here.
545  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
546  store float %med3, float addrspace(1)* %outgep
547  ret void
548}
549
; Combination 2 from the table above with the operands of the outermost max
; swapped: max(min(z, max(x, y)), min(x, y)) with x = %a, y = %b, z = %c.
; The checks expect v_med3_f32 with operands A, B, C.
550; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat10:
551; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
552; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
553; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
554; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
555define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
556  %tid = call i32 @llvm.amdgcn.workitem.id.x()
557  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
558  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
559  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
560  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
561  %a = load volatile float, float addrspace(1)* %gep0
562  %b = load volatile float, float addrspace(1)* %gep1
563  %c = load volatile float, float addrspace(1)* %gep2
564  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
565  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
  ; Second min is commuted.
566  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
  ; Outermost max takes %tmp2 first here.
567  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
568  store float %med3, float addrspace(1)* %outgep
569  ret void
570}
571
; Combination 3 from the table above with the operands of the outermost max
; swapped: max(min(z, max(y, x)), min(x, y)) with x = %a, y = %b, z = %c.
; The checks expect v_med3_f32 with operands B, A, C.
572; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat11:
573; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
574; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
575; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
576; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
577define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
578  %tid = call i32 @llvm.amdgcn.workitem.id.x()
579  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
580  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
581  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
582  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
583  %a = load volatile float, float addrspace(1)* %gep0
584  %b = load volatile float, float addrspace(1)* %gep1
585  %c = load volatile float, float addrspace(1)* %gep2
586  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  ; Inner max and second min are both commuted.
587  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
588  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
  ; Outermost max takes %tmp2 first here.
589  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
590  store float %med3, float addrspace(1)* %outgep
591  ret void
592}
593
; Median pattern with the operands of the outermost max swapped. As written,
; the IR is max(min(z, max(y, x)), min(y, x)) with x = %a, y = %b, z = %c.
; The checks expect v_med3_f32 with operands B, A, C.
; NOTE(review) -- by the numbering in the table above, test 12 would be
; combination 4 with the outer max commuted, i.e. max(%a, %b) feeding
; min(%tmp1, %c). The IR below instead matches combination 7 with the outer
; max commuted. Confirm whether this is intended before changing the IR; the
; expected operand order may need to change with it.
594; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat12:
595; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
596; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
597; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
598; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
599define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
600  %tid = call i32 @llvm.amdgcn.workitem.id.x()
601  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
602  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
603  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
604  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
605  %a = load volatile float, float addrspace(1)* %gep0
606  %b = load volatile float, float addrspace(1)* %gep1
607  %c = load volatile float, float addrspace(1)* %gep2
608  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
609  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
610  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
  ; Outermost max takes %tmp2 first here.
611  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0)
612  store float %med3, float addrspace(1)* %outgep
613  ret void
614}
615
; Commuted form 13 of the median-of-three idiom, built under the function-wide
; "no-nans-fp-math"="true" attribute group (#2):
;   maxnum(minnum(maxnum(b, a), c), minnum(b, a))
; The checks expect the whole expression to select to a single v_med3_f32.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat13:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()

  ; One element per workitem from each input array, one slot in the output.
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %wi
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %wi
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %wi
  %dst = getelementptr float, float addrspace(1)* %out, i32 %wi

  ; Volatile loads in a, b, c order; the checks bind A, B, C in that order.
  %va = load volatile float, float addrspace(1)* %a.addr
  %vb = load volatile float, float addrspace(1)* %b.addr
  %vc = load volatile float, float addrspace(1)* %c.addr

  %lo = call float @llvm.minnum.f32(float %vb, float %va)
  %hi = call float @llvm.maxnum.f32(float %vb, float %va)
  %capped = call float @llvm.minnum.f32(float %hi, float %vc)
  %median = call float @llvm.maxnum.f32(float %capped, float %lo)

  store float %median, float addrspace(1)* %dst
  ret void
}
637
; Commuted form 14 of the median-of-three idiom, built under the function-wide
; "no-nans-fp-math"="true" attribute group (#2):
;   maxnum(minnum(c, maxnum(a, b)), minnum(b, a))
; The checks expect the whole expression to select to a single v_med3_f32.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat14:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()

  ; One element per workitem from each input array, one slot in the output.
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %wi
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %wi
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %wi
  %dst = getelementptr float, float addrspace(1)* %out, i32 %wi

  ; Volatile loads in a, b, c order; the checks bind A, B, C in that order.
  %va = load volatile float, float addrspace(1)* %a.addr
  %vb = load volatile float, float addrspace(1)* %b.addr
  %vc = load volatile float, float addrspace(1)* %c.addr

  %lo = call float @llvm.minnum.f32(float %vb, float %va)
  %hi = call float @llvm.maxnum.f32(float %va, float %vb)
  %capped = call float @llvm.minnum.f32(float %vc, float %hi)
  %median = call float @llvm.maxnum.f32(float %capped, float %lo)

  store float %median, float addrspace(1)* %dst
  ret void
}
659
; Commuted form 15 of the median-of-three idiom, built under the function-wide
; "no-nans-fp-math"="true" attribute group (#2); every commutable operand pair
; is swapped relative to the a-before-b, value-before-c ordering:
;   maxnum(minnum(c, maxnum(b, a)), minnum(b, a))
; The checks expect the whole expression to select to a single v_med3_f32.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat15:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()

  ; One element per workitem from each input array, one slot in the output.
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %wi
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %wi
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %wi
  %dst = getelementptr float, float addrspace(1)* %out, i32 %wi

  ; Volatile loads in a, b, c order; the checks bind A, B, C in that order.
  %va = load volatile float, float addrspace(1)* %a.addr
  %vb = load volatile float, float addrspace(1)* %b.addr
  %vc = load volatile float, float addrspace(1)* %c.addr

  %lo = call float @llvm.minnum.f32(float %vb, float %va)
  %hi = call float @llvm.maxnum.f32(float %vb, float %va)
  %capped = call float @llvm.minnum.f32(float %vc, float %hi)
  %median = call float @llvm.maxnum.f32(float %capped, float %lo)

  store float %median, float addrspace(1)* %dst
  ret void
}
681
682; ---------------------------------------------------------------------
683; Negative patterns
684; ---------------------------------------------------------------------
685
; Negative test: the median-of-three expression is built without any no-NaNs
; guarantee (attribute group #1), and the first minnum result is additionally
; stored, giving it a second use. The checks expect the individual min/max
; instructions to remain.
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0:
; GCN-DAG: v_min_f32
; GCN-DAG: v_max_f32
; GCN: v_min_f32
; GCN: v_max_f32
define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()

  ; One element per workitem from each input array, one slot in the output.
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %wi
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %wi
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %wi
  %dst = getelementptr float, float addrspace(1)* %out, i32 %wi

  %va = load volatile float, float addrspace(1)* %a.addr
  %vb = load volatile float, float addrspace(1)* %b.addr
  %vc = load volatile float, float addrspace(1)* %c.addr

  %lo = call float @llvm.minnum.f32(float %va, float %vb)
  ; Extra use of the inner minnum.
  store volatile float %lo, float addrspace(1)* undef
  %hi = call float @llvm.maxnum.f32(float %va, float %vb)
  %capped = call float @llvm.minnum.f32(float %hi, float %vc)
  %median = call float @llvm.maxnum.f32(float %lo, float %capped)

  store float %median, float addrspace(1)* %dst
  ret void
}
708
; Negative test: the median-of-three expression is built without any no-NaNs
; guarantee (attribute group #1), and the inner maxnum result (%tmp1) is
; additionally stored, giving it a second use. No med3 instruction may be
; formed; without the negative check below this test would only verify that
; the function compiles.
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1:
; GCN-NOT: v_med3
define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
  ; Extra use of the inner maxnum.
  store volatile float %tmp1, float addrspace(1)* undef
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}
727
; Negative test: the median-of-three expression is built without any no-NaNs
; guarantee (attribute group #1), and the outer minnum result (%tmp2) is
; additionally stored, giving it a second use. No med3 instruction may be
; formed; without the negative check below this test would only verify that
; the function compiles.
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2:
; GCN-NOT: v_med3
define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  ; Extra use of the minnum(maxnum(a, b), c) intermediate.
  store volatile float %tmp2, float addrspace(1)* undef
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}
746
747
; Negative test: the plain median-of-three expression
;   maxnum(minnum(a, b), minnum(maxnum(a, b), c))
; with no nnan flags and "no-nans-fp-math"="false" (attribute group #1).
; Nothing guarantees NaN-free inputs, so no med3 instruction may be formed;
; without the negative check below this test would only verify that the
; function compiles.
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0:
; GCN-NOT: v_med3
define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2
  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}
765
; Negative test: two of the three median inputs come from `fadd nnan`, but the
; first one (%a.nnan, despite its name) comes from a plain fadd and the
; function has "no-nans-fp-math"="false" (attribute group #1). No med3
; instruction may be formed; without the negative check below this test would
; only verify that the function compiles.
; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0:
; GCN-NOT: v_med3
define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2

  ; Operand 0 deliberately lacks the nnan flag.
  %a.nnan = fadd float %a, 1.0
  %b.nnan = fadd nnan float %b, 2.0
  %c.nnan = fadd nnan float %c, 4.0

  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}
788
; Negative test: two of the three median inputs come from `fadd nnan`, but the
; second one (%b.nnan, despite its name) comes from a plain fadd and the
; function has "no-nans-fp-math"="false" (attribute group #1). No med3
; instruction may be formed; without the negative check below this test would
; only verify that the function compiles.
; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0:
; GCN-NOT: v_med3
define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2

  ; Operand 1 deliberately lacks the nnan flag.
  %a.nnan = fadd nnan float %a, 1.0
  %b.nnan = fadd float %b, 2.0
  %c.nnan = fadd nnan float %c, 4.0

  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}
811
; Negative test: two of the three median inputs come from `fadd nnan`, but the
; third one (%c.nnan, despite its name) comes from a plain fadd and the
; function has "no-nans-fp-math"="false" (attribute group #1). No med3
; instruction may be formed; without the negative check below this test would
; only verify that the function compiles.
; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0:
; GCN-NOT: v_med3
define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
  %a = load volatile float, float addrspace(1)* %gep0
  %b = load volatile float, float addrspace(1)* %gep1
  %c = load volatile float, float addrspace(1)* %gep2

  ; Operand 2 deliberately lacks the nnan flag.
  %a.nnan = fadd nnan float %a, 1.0
  %b.nnan = fadd nnan float %b, 2.0
  %c.nnan = fadd float %c, 4.0

  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
  store float %med3, float addrspace(1)* %outgep
  ret void
}
834
; Negative test under "no-nans-fp-math"="true" (attribute group #2): the inner
; minnum consumes the negated a while the inner maxnum consumes a itself, so
; the two halves of the expression do not share the same first operand. The
; checks expect two min and two max instructions to remain.
; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN-DAG: v_min_f32
; GCN-DAG: v_max_f32
; GCN-DAG: v_min_f32
; GCN-DAG: v_max_f32
define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()

  ; One element per workitem from each input array, one slot in the output.
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %wi
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %wi
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %wi
  %dst = getelementptr float, float addrspace(1)* %out, i32 %wi

  ; Volatile loads in a, b, c order; the checks bind A, B, C in that order.
  %va = load volatile float, float addrspace(1)* %a.addr
  %vb = load volatile float, float addrspace(1)* %b.addr
  %vc = load volatile float, float addrspace(1)* %c.addr

  ; Negation written as a subtraction from -0.0.
  %neg.a = fsub float -0.0, %va

  %lo = call float @llvm.minnum.f32(float %neg.a, float %vb)
  %hi = call float @llvm.maxnum.f32(float %va, float %vb)
  %capped = call float @llvm.minnum.f32(float %hi, float %vc)
  %result = call float @llvm.maxnum.f32(float %lo, float %capped)

  store float %result, float addrspace(1)* %dst
  ret void
}
860
; A lone minnum(maxnum(a, b), c) with three variable operands is not enough to
; form a med3, even under "no-nans-fp-math"="true" (attribute group #2); the
; checks expect a separate max followed by a min.
; GCN-LABEL: {{^}}v_test_global_nnans_min_max_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[C:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[A]], [[B]]
; GCN: v_min_f32_e32 v{{[0-9]+}}, [[MAX]], [[C]]
define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()

  ; One element per workitem from each input array, one slot in the output.
  %a.addr = getelementptr float, float addrspace(1)* %aptr, i32 %wi
  %b.addr = getelementptr float, float addrspace(1)* %bptr, i32 %wi
  %c.addr = getelementptr float, float addrspace(1)* %cptr, i32 %wi
  %dst = getelementptr float, float addrspace(1)* %out, i32 %wi

  ; Volatile loads in a, b, c order; the checks bind A, B, C in that order.
  %va = load volatile float, float addrspace(1)* %a.addr
  %vb = load volatile float, float addrspace(1)* %b.addr
  %vc = load volatile float, float addrspace(1)* %c.addr

  %hi = call float @llvm.maxnum.f32(float %va, float %vb)
  %capped = call float @llvm.minnum.f32(float %hi, float %vc)

  store float %capped, float addrspace(1)* %dst
  ret void
}
882
; Half-precision clamp of an `fadd nnan` result between the constants 2.0 and
; 4.0: minnum(maxnum(a + 1.0, 2.0), 4.0).
; Per the checks, the default-CPU run converts to f32 and uses v_med3_f32, the
; tonga run keeps separate f16 max and min, and the gfx900 run uses
; v_med3_f16.
; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16:
; SI: v_cvt_f32_f16
; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
; SI: v_cvt_f16_f32

; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0
; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0
; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0

; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0
; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr half, half addrspace(1)* %aptr, i32 %wi
  %dst = getelementptr half, half addrspace(1)* %out, i32 %wi

  %val = load half, half addrspace(1)* %src
  ; The nnan flag on this fadd is the only no-NaNs information available.
  %biased = fadd nnan half %val, 1.0
  %floor = call half @llvm.maxnum.f16(half %biased, half 2.0)
  %clamped = call half @llvm.minnum.f16(half %floor, half 4.0)

  store half %clamped, half addrspace(1)* %dst
  ret void
}
907
; Half-precision median-of-three where every input is produced by an
; `fadd nnan`:
;   maxnum(minnum(a', b'), minnum(maxnum(a', b'), c'))
; Per the checks, the default-CPU run converts to f32 and uses v_med3_f32, the
; tonga run keeps the four f16 min/max instructions, and the gfx900 run uses a
; single v_med3_f16 on the three add results.
; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0:
; GCN: {{buffer|flat|global}}_load_ushort [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[B:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_ushort [[C:v[0-9]+]]

; SI: v_cvt_f32_f16
; SI: v_cvt_f32_f16
; SI: v_add_f32_e32
; SI: v_add_f32_e32
; SI: v_add_f32_e32
; SI: v_med3_f32
; SI: v_cvt_f16_f32_e32


; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]

; VI-DAG: v_min_f16
; VI-DAG: v_max_f16
; VI: v_min_f16
; VI: v_max_f16

; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()

  ; One element per workitem from each input array, one slot in the output.
  %a.addr = getelementptr half, half addrspace(1)* %aptr, i32 %wi
  %b.addr = getelementptr half, half addrspace(1)* %bptr, i32 %wi
  %c.addr = getelementptr half, half addrspace(1)* %cptr, i32 %wi
  %dst = getelementptr half, half addrspace(1)* %out, i32 %wi

  ; Volatile loads in a, b, c order; the checks bind A, B, C in that order.
  %va = load volatile half, half addrspace(1)* %a.addr
  %vb = load volatile half, half addrspace(1)* %b.addr
  %vc = load volatile half, half addrspace(1)* %c.addr

  ; Each input carries its own nnan flag; the function attributes (#1) do not
  ; enable no-NaNs math.
  %a.sum = fadd nnan half %va, 1.0
  %b.sum = fadd nnan half %vb, 2.0
  %c.sum = fadd nnan half %vc, 4.0

  %lo = call half @llvm.minnum.f16(half %a.sum, half %b.sum)
  %hi = call half @llvm.maxnum.f16(half %a.sum, half %b.sum)
  %capped = call half @llvm.minnum.f16(half %hi, half %c.sum)
  %median = call half @llvm.maxnum.f16(half %lo, half %capped)

  store half %median, half addrspace(1)* %dst
  ret void
}
953
; Clamp of an `fadd nnan` result between 8.0 and 16.0. Both bounds appear as
; 32-bit literals (0x41000000, 0x41800000) in the expected output, and the
; checks expect a separate max and min rather than a med3.
; GCN-LABEL: {{^}}two_non_inline_constant:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x41000000, [[ADD]]
; GCN: v_min_f32_e32 v{{[0-9]+}}, 0x41800000, [[MAX]]
define amdgpu_kernel void @two_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %wi
  %dst = getelementptr float, float addrspace(1)* %out, i32 %wi

  %val = load float, float addrspace(1)* %src
  %biased = fadd nnan float %val, 0.5
  %floor = call float @llvm.maxnum.f32(float %biased, float 8.0)
  %clamped = call float @llvm.minnum.f32(float %floor, float 16.0)

  store float %clamped, float addrspace(1)* %dst
  ret void
}
970
; Clamp of an `fadd nnan` result between 1.0 and 16.0. The checks expect the
; 16.0 bound (0x41800000) to be materialized in an SGPR and a med3 to be
; formed with it.
; FIXME: Simple stores do not work as a multiple use because they are bitcast
; to integer constants, so the second use of 16.0 is made through an fadd.
; GCN-LABEL: {{^}}one_non_inline_constant:
; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]]
define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %wi
  %dst = getelementptr float, float addrspace(1)* %out, i32 %wi

  %val = load float, float addrspace(1)* %src
  %biased = fadd nnan float %val, 0.5
  %floor = call float @llvm.maxnum.f32(float %biased, float 1.0)
  %clamped = call float @llvm.minnum.f32(float %floor, float 16.0)

  store float %clamped, float addrspace(1)* %dst

  ; Additional user of the 16.0 constant.
  %k1.user = fadd float %val, 16.0
  store volatile float %k1.user, float addrspace(1)* undef
  ret void
}
991
; Clamp of an `fadd nnan` result between 8.0 and 16.0 where each bound has one
; more user elsewhere in the function. The checks expect both bounds to be
; materialized in registers (0x41000000 and 0x41800000) and a med3 to be
; formed from them.
; GCN-LABEL: {{^}}two_non_inline_constant_multi_use:
; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000
; GCN-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x41000000
; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], [[K1]]
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]]
define amdgpu_kernel void @two_non_inline_constant_multi_use(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr float, float addrspace(1)* %aptr, i32 %wi
  %dst = getelementptr float, float addrspace(1)* %out, i32 %wi

  %val = load float, float addrspace(1)* %src
  %biased = fadd nnan float %val, 0.5
  %floor = call float @llvm.maxnum.f32(float %biased, float 8.0)
  %clamped = call float @llvm.minnum.f32(float %floor, float 16.0)

  store float %clamped, float addrspace(1)* %dst

  ; Additional users of the 16.0 and 8.0 constants, in that order.
  %k1.user = fadd float %val, 16.0
  store volatile float %k1.user, float addrspace(1)* undef
  %k0.user = fadd float %val, 8.0
  store volatile float %k0.user, float addrspace(1)* undef
  ret void
}
1015
; Intrinsic declarations, grouped by element type. All share attribute
; group #0.
declare i32 @llvm.amdgcn.workitem.id.x() #0

; half
declare half @llvm.fabs.f16(half) #0
declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0

; float
declare float @llvm.fabs.f32(float) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0

; double
declare double @llvm.minnum.f64(double, double) #0
declare double @llvm.maxnum.f64(double, double) #0

; #0 - intrinsic declarations.
; #1 - kernels compiled with "no-nans-fp-math"="false".
; #2 - kernels compiled with "no-nans-fp-math"="true".
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
1029