1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
3; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s
4; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
5; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
6
; Positive case: median-of-three built from minnum/maxnum with a negated first operand.
; The IR computes maxnum(minnum(-a, b), minnum(maxnum(-a, b), c)) and the function
; carries attribute group #2 ("no-nans-fp-math"="true").
; Every check prefix expects a single v_med3_f32. The negation is expected as a
; separate v_sub_f32 from 0x80000000 (the bit pattern of -0.0), i.e. it is not
; folded into the v_med3_f32 as a source modifier.
7define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
8; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
9; SI:       ; %bb.0:
10; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
11; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
12; SI-NEXT:    v_mov_b32_e32 v1, 0
13; SI-NEXT:    s_mov_b32 s10, 0
14; SI-NEXT:    s_mov_b32 s11, 0xf000
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
17; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
18; SI-NEXT:    s_waitcnt vmcnt(0)
19; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
20; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
21; SI-NEXT:    s_waitcnt vmcnt(0)
22; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
23; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    v_sub_f32_e32 v2, 0x80000000, v2
26; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
27; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
28; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
29; SI-NEXT:    s_endpgm
30;
31; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
32; VI:       ; %bb.0:
33; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
34; VI-NEXT:    v_lshlrev_b32_e32 v8, 2, v0
35; VI-NEXT:    s_waitcnt lgkmcnt(0)
36; VI-NEXT:    v_mov_b32_e32 v0, s2
37; VI-NEXT:    v_mov_b32_e32 v1, s3
38; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
39; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
40; VI-NEXT:    v_mov_b32_e32 v2, s4
41; VI-NEXT:    v_mov_b32_e32 v3, s5
42; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v8
43; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
44; VI-NEXT:    v_mov_b32_e32 v4, s6
45; VI-NEXT:    v_mov_b32_e32 v5, s7
46; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
47; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
48; VI-NEXT:    flat_load_dword v0, v[0:1] glc
49; VI-NEXT:    s_waitcnt vmcnt(0)
50; VI-NEXT:    flat_load_dword v1, v[2:3] glc
51; VI-NEXT:    s_waitcnt vmcnt(0)
52; VI-NEXT:    flat_load_dword v2, v[4:5] glc
53; VI-NEXT:    s_waitcnt vmcnt(0)
54; VI-NEXT:    v_mov_b32_e32 v7, s1
55; VI-NEXT:    v_mov_b32_e32 v6, s0
56; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
57; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
58; VI-NEXT:    v_sub_f32_e32 v0, 0x80000000, v0
59; VI-NEXT:    v_med3_f32 v0, v0, v1, v2
60; VI-NEXT:    flat_store_dword v[6:7], v0
61; VI-NEXT:    s_endpgm
62;
63; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
64; GFX9:       ; %bb.0:
65; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
66; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
67; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
69; GFX9-NEXT:    s_waitcnt vmcnt(0)
70; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
71; GFX9-NEXT:    s_waitcnt vmcnt(0)
72; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
73; GFX9-NEXT:    s_waitcnt vmcnt(0)
74; GFX9-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
75; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
76; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
77; GFX9-NEXT:    s_endpgm
78;
79; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
80; GFX10:       ; %bb.0:
81; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
82; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
83; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
84; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
85; GFX10-NEXT:    s_waitcnt vmcnt(0)
86; GFX10-NEXT:    global_load_dword v2, v0, s[4:5] glc dlc
87; GFX10-NEXT:    s_waitcnt vmcnt(0)
88; GFX10-NEXT:    global_load_dword v3, v0, s[6:7] glc dlc
89; GFX10-NEXT:    s_waitcnt vmcnt(0)
90; GFX10-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
91; GFX10-NEXT:    v_med3_f32 v1, v1, v2, v3
92; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
93; GFX10-NEXT:    s_endpgm
94  %tid = call i32 @llvm.amdgcn.workitem.id.x()
95  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
96  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
97  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
98  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
99  %a = load volatile float, float addrspace(1)* %gep0
100  %b = load volatile float, float addrspace(1)* %gep1
101  %c = load volatile float, float addrspace(1)* %gep2
; The negation is written as (-0.0 - a).
102  %a.fneg = fsub float -0.0, %a
; Median-of-three pattern: max(min(x, y), min(max(x, y), z)).
103  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
104  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
105  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
106  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
107  store float %med3, float addrspace(1)* %outgep
108  ret void
109}
110
; Same IR body as @v_test_global_nnans_med3_f32_pat0_srcmod0, but with attribute
; group #1 ("no-nans-fp-math"="false") and no nnan flags on the operands.
; No check prefix expects v_med3_f32 here; the individual v_min_f32 / v_max_f32
; instructions remain. The SI and VI prefixes expect a v_mul_f32 by 1.0 on each
; input first, while the GFX9 and GFX10 prefixes expect v_max_f32 of a register
; with itself (presumably NaN-quieting canonicalization for minnum/maxnum -- confirm).
111define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
112; SI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
113; SI:       ; %bb.0:
114; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
115; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
116; SI-NEXT:    v_mov_b32_e32 v1, 0
117; SI-NEXT:    s_mov_b32 s10, 0
118; SI-NEXT:    s_mov_b32 s11, 0xf000
119; SI-NEXT:    s_waitcnt lgkmcnt(0)
120; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
121; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
122; SI-NEXT:    s_waitcnt vmcnt(0)
123; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
124; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
125; SI-NEXT:    s_waitcnt vmcnt(0)
126; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
127; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
128; SI-NEXT:    s_waitcnt vmcnt(0)
129; SI-NEXT:    v_sub_f32_e32 v2, 0x80000000, v2
130; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
131; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
132; SI-NEXT:    v_min_f32_e32 v5, v2, v3
133; SI-NEXT:    v_max_f32_e32 v2, v2, v3
134; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
135; SI-NEXT:    v_min_f32_e32 v2, v2, v3
136; SI-NEXT:    v_max_f32_e32 v2, v5, v2
137; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
138; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
139; SI-NEXT:    s_endpgm
140;
141; VI-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
142; VI:       ; %bb.0:
143; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
144; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
145; VI-NEXT:    s_waitcnt lgkmcnt(0)
146; VI-NEXT:    v_mov_b32_e32 v0, s2
147; VI-NEXT:    v_mov_b32_e32 v1, s3
148; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
149; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
150; VI-NEXT:    v_mov_b32_e32 v2, s4
151; VI-NEXT:    v_mov_b32_e32 v3, s5
152; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
153; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
154; VI-NEXT:    v_mov_b32_e32 v4, s6
155; VI-NEXT:    v_mov_b32_e32 v5, s7
156; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
157; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
158; VI-NEXT:    flat_load_dword v7, v[0:1] glc
159; VI-NEXT:    s_waitcnt vmcnt(0)
160; VI-NEXT:    flat_load_dword v2, v[2:3] glc
161; VI-NEXT:    s_waitcnt vmcnt(0)
162; VI-NEXT:    flat_load_dword v3, v[4:5] glc
163; VI-NEXT:    s_waitcnt vmcnt(0)
164; VI-NEXT:    v_mov_b32_e32 v0, s0
165; VI-NEXT:    v_mov_b32_e32 v1, s1
166; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
167; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
168; VI-NEXT:    v_sub_f32_e32 v4, 0x80000000, v7
169; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
170; VI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
171; VI-NEXT:    v_min_f32_e32 v5, v4, v2
172; VI-NEXT:    v_max_f32_e32 v2, v4, v2
173; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
174; VI-NEXT:    v_min_f32_e32 v2, v2, v3
175; VI-NEXT:    v_max_f32_e32 v2, v5, v2
176; VI-NEXT:    flat_store_dword v[0:1], v2
177; VI-NEXT:    s_endpgm
178;
179; GFX9-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
180; GFX9:       ; %bb.0:
181; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
182; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
183; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
185; GFX9-NEXT:    s_waitcnt vmcnt(0)
186; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
187; GFX9-NEXT:    s_waitcnt vmcnt(0)
188; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
189; GFX9-NEXT:    s_waitcnt vmcnt(0)
190; GFX9-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
191; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
192; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
193; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
194; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
195; GFX9-NEXT:    v_max_f32_e32 v2, v3, v3
196; GFX9-NEXT:    v_min_f32_e32 v1, v1, v2
197; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
198; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
199; GFX9-NEXT:    s_endpgm
200;
201; GFX10-LABEL: v_test_no_global_nnans_med3_f32_pat0_srcmod0:
202; GFX10:       ; %bb.0:
203; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
204; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
205; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
206; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
207; GFX10-NEXT:    s_waitcnt vmcnt(0)
208; GFX10-NEXT:    global_load_dword v2, v0, s[4:5] glc dlc
209; GFX10-NEXT:    s_waitcnt vmcnt(0)
210; GFX10-NEXT:    global_load_dword v3, v0, s[6:7] glc dlc
211; GFX10-NEXT:    s_waitcnt vmcnt(0)
212; GFX10-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
213; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
214; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
215; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
216; GFX10-NEXT:    v_max_f32_e32 v4, v1, v2
217; GFX10-NEXT:    v_min_f32_e32 v1, v1, v2
218; GFX10-NEXT:    v_min_f32_e32 v2, v4, v3
219; GFX10-NEXT:    v_max_f32_e32 v1, v1, v2
220; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
221; GFX10-NEXT:    s_endpgm
222  %tid = call i32 @llvm.amdgcn.workitem.id.x()
223  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
224  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
225  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
226  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
227  %a = load volatile float, float addrspace(1)* %gep0
228  %b = load volatile float, float addrspace(1)* %gep1
229  %c = load volatile float, float addrspace(1)* %gep2
; The negation is written as (-0.0 - a).
230  %a.fneg = fsub float -0.0, %a
; Median-of-three pattern: max(min(x, y), min(max(x, y), z)).
231  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
232  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
233  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
234  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
235  store float %med3, float addrspace(1)* %outgep
236  ret void
237}
238
; Positive case with a modifier on all three operands: -a, |b| and -|c|.
; The function carries attribute group #2 ("no-nans-fp-math"="true") and every
; check prefix expects a single v_med3_f32.
; Only the fabs on b is expected to fold into v_med3_f32 (written |v|). The two
; negations are expected as v_sub_f32 from 0x80000000 held in s2, with the fabs
; on c folded into that subtract rather than into the v_med3_f32.
239define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
240; SI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
241; SI:       ; %bb.0:
242; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
243; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
244; SI-NEXT:    v_mov_b32_e32 v1, 0
245; SI-NEXT:    s_mov_b32 s10, 0
246; SI-NEXT:    s_mov_b32 s11, 0xf000
247; SI-NEXT:    s_waitcnt lgkmcnt(0)
248; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
249; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
250; SI-NEXT:    s_waitcnt vmcnt(0)
251; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
252; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
253; SI-NEXT:    s_waitcnt vmcnt(0)
254; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
255; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
256; SI-NEXT:    s_waitcnt vmcnt(0)
257; SI-NEXT:    s_mov_b32 s2, 0x80000000
258; SI-NEXT:    v_sub_f32_e32 v2, s2, v2
259; SI-NEXT:    v_sub_f32_e64 v4, s2, |v4|
260; SI-NEXT:    v_med3_f32 v2, v2, |v3|, v4
261; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
262; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
263; SI-NEXT:    s_endpgm
264;
265; VI-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
266; VI:       ; %bb.0:
267; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
268; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
269; VI-NEXT:    s_waitcnt lgkmcnt(0)
270; VI-NEXT:    v_mov_b32_e32 v0, s2
271; VI-NEXT:    v_mov_b32_e32 v1, s3
272; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
273; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
274; VI-NEXT:    v_mov_b32_e32 v2, s4
275; VI-NEXT:    v_mov_b32_e32 v3, s5
276; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
277; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
278; VI-NEXT:    v_mov_b32_e32 v4, s6
279; VI-NEXT:    v_mov_b32_e32 v5, s7
280; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
281; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
282; VI-NEXT:    flat_load_dword v7, v[0:1] glc
283; VI-NEXT:    s_waitcnt vmcnt(0)
284; VI-NEXT:    flat_load_dword v2, v[2:3] glc
285; VI-NEXT:    s_waitcnt vmcnt(0)
286; VI-NEXT:    flat_load_dword v3, v[4:5] glc
287; VI-NEXT:    s_waitcnt vmcnt(0)
288; VI-NEXT:    s_mov_b32 s2, 0x80000000
289; VI-NEXT:    v_mov_b32_e32 v0, s0
290; VI-NEXT:    v_mov_b32_e32 v1, s1
291; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
292; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
293; VI-NEXT:    v_sub_f32_e32 v4, s2, v7
294; VI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
295; VI-NEXT:    v_med3_f32 v2, v4, |v2|, v3
296; VI-NEXT:    flat_store_dword v[0:1], v2
297; VI-NEXT:    s_endpgm
298;
299; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
300; GFX9:       ; %bb.0:
301; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
302; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
303; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
305; GFX9-NEXT:    s_waitcnt vmcnt(0)
306; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
307; GFX9-NEXT:    s_waitcnt vmcnt(0)
308; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
309; GFX9-NEXT:    s_waitcnt vmcnt(0)
310; GFX9-NEXT:    s_mov_b32 s2, 0x80000000
311; GFX9-NEXT:    v_sub_f32_e32 v1, s2, v1
312; GFX9-NEXT:    v_sub_f32_e64 v3, s2, |v3|
313; GFX9-NEXT:    v_med3_f32 v1, v1, |v2|, v3
314; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
315; GFX9-NEXT:    s_endpgm
316;
317; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
318; GFX10:       ; %bb.0:
319; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
320; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
321; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
323; GFX10-NEXT:    s_waitcnt vmcnt(0)
324; GFX10-NEXT:    global_load_dword v2, v0, s[4:5] glc dlc
325; GFX10-NEXT:    s_waitcnt vmcnt(0)
326; GFX10-NEXT:    global_load_dword v3, v0, s[6:7] glc dlc
327; GFX10-NEXT:    s_waitcnt vmcnt(0)
328; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
329; GFX10-NEXT:    s_mov_b32 s2, 0x80000000
330; GFX10-NEXT:    v_sub_f32_e32 v1, s2, v1
331; GFX10-NEXT:    v_sub_f32_e64 v3, s2, |v3|
332; GFX10-NEXT:    v_med3_f32 v1, v1, |v2|, v3
333; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
334; GFX10-NEXT:    s_endpgm
335  %tid = call i32 @llvm.amdgcn.workitem.id.x()
336  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
337  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
338  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
339  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
340  %a = load volatile float, float addrspace(1)* %gep0
341  %b = load volatile float, float addrspace(1)* %gep1
342  %c = load volatile float, float addrspace(1)* %gep2
343
; Operand 0 is negated, operand 1 takes fabs, operand 2 takes fabs then negation.
344  %a.fneg = fsub float -0.0, %a
345  %b.fabs = call float @llvm.fabs.f32(float %b)
346  %c.fabs = call float @llvm.fabs.f32(float %c)
347  %c.fabs.fneg = fsub float -0.0, %c.fabs
348
; Median-of-three pattern: max(min(x, y), min(max(x, y), z)).
349  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
350  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
351  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
352  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
353
354  store float %med3, float addrspace(1)* %outgep
355  ret void
356}
357
; Positive case where every operand is -|x| (fabs followed by negation).
; The function carries attribute group #2 ("no-nans-fp-math"="true").
; Every check prefix expects three v_sub_f32_e64 instructions of the form
; "s2 - |v|" with s2 = 0x80000000 (the bit pattern of -0.0), feeding one
; v_med3_f32 that carries no source modifiers of its own.
358define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
359; SI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
360; SI:       ; %bb.0:
361; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
362; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
363; SI-NEXT:    v_mov_b32_e32 v1, 0
364; SI-NEXT:    s_mov_b32 s10, 0
365; SI-NEXT:    s_mov_b32 s11, 0xf000
366; SI-NEXT:    s_waitcnt lgkmcnt(0)
367; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
368; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
369; SI-NEXT:    s_waitcnt vmcnt(0)
370; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
371; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
372; SI-NEXT:    s_waitcnt vmcnt(0)
373; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
374; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
375; SI-NEXT:    s_waitcnt vmcnt(0)
376; SI-NEXT:    s_mov_b32 s2, 0x80000000
377; SI-NEXT:    v_sub_f32_e64 v2, s2, |v2|
378; SI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
379; SI-NEXT:    v_sub_f32_e64 v4, s2, |v4|
380; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
381; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
382; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
383; SI-NEXT:    s_endpgm
384;
385; VI-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
386; VI:       ; %bb.0:
387; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
388; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
389; VI-NEXT:    s_waitcnt lgkmcnt(0)
390; VI-NEXT:    v_mov_b32_e32 v0, s2
391; VI-NEXT:    v_mov_b32_e32 v1, s3
392; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
393; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
394; VI-NEXT:    v_mov_b32_e32 v2, s4
395; VI-NEXT:    v_mov_b32_e32 v3, s5
396; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
397; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
398; VI-NEXT:    v_mov_b32_e32 v4, s6
399; VI-NEXT:    v_mov_b32_e32 v5, s7
400; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
401; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
402; VI-NEXT:    flat_load_dword v7, v[0:1] glc
403; VI-NEXT:    s_waitcnt vmcnt(0)
404; VI-NEXT:    flat_load_dword v2, v[2:3] glc
405; VI-NEXT:    s_waitcnt vmcnt(0)
406; VI-NEXT:    flat_load_dword v3, v[4:5] glc
407; VI-NEXT:    s_waitcnt vmcnt(0)
408; VI-NEXT:    s_mov_b32 s2, 0x80000000
409; VI-NEXT:    v_mov_b32_e32 v0, s0
410; VI-NEXT:    v_mov_b32_e32 v1, s1
411; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
412; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
413; VI-NEXT:    v_sub_f32_e64 v4, s2, |v7|
414; VI-NEXT:    v_sub_f32_e64 v2, s2, |v2|
415; VI-NEXT:    v_sub_f32_e64 v3, s2, |v3|
416; VI-NEXT:    v_med3_f32 v2, v4, v2, v3
417; VI-NEXT:    flat_store_dword v[0:1], v2
418; VI-NEXT:    s_endpgm
419;
420; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
421; GFX9:       ; %bb.0:
422; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
423; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
424; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
425; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
426; GFX9-NEXT:    s_waitcnt vmcnt(0)
427; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
428; GFX9-NEXT:    s_waitcnt vmcnt(0)
429; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
430; GFX9-NEXT:    s_waitcnt vmcnt(0)
431; GFX9-NEXT:    s_mov_b32 s2, 0x80000000
432; GFX9-NEXT:    v_sub_f32_e64 v1, s2, |v1|
433; GFX9-NEXT:    v_sub_f32_e64 v2, s2, |v2|
434; GFX9-NEXT:    v_sub_f32_e64 v3, s2, |v3|
435; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
436; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
437; GFX9-NEXT:    s_endpgm
438;
439; GFX10-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
440; GFX10:       ; %bb.0:
441; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
442; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
443; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
444; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
445; GFX10-NEXT:    s_waitcnt vmcnt(0)
446; GFX10-NEXT:    global_load_dword v2, v0, s[4:5] glc dlc
447; GFX10-NEXT:    s_waitcnt vmcnt(0)
448; GFX10-NEXT:    global_load_dword v3, v0, s[6:7] glc dlc
449; GFX10-NEXT:    s_waitcnt vmcnt(0)
450; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
451; GFX10-NEXT:    s_mov_b32 s2, 0x80000000
452; GFX10-NEXT:    v_sub_f32_e64 v1, s2, |v1|
453; GFX10-NEXT:    v_sub_f32_e64 v2, s2, |v2|
454; GFX10-NEXT:    v_sub_f32_e64 v3, s2, |v3|
455; GFX10-NEXT:    v_med3_f32 v1, v1, v2, v3
456; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
457; GFX10-NEXT:    s_endpgm
458  %tid = call i32 @llvm.amdgcn.workitem.id.x()
459  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
460  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
461  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
462  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
463  %a = load volatile float, float addrspace(1)* %gep0
464  %b = load volatile float, float addrspace(1)* %gep1
465  %c = load volatile float, float addrspace(1)* %gep2
466
; Each operand becomes -|x|; the negation is written as (-0.0 - |x|).
467  %a.fabs = call float @llvm.fabs.f32(float %a)
468  %a.fabs.fneg = fsub float -0.0, %a.fabs
469  %b.fabs = call float @llvm.fabs.f32(float %b)
470  %b.fabs.fneg = fsub float -0.0, %b.fabs
471  %c.fabs = call float @llvm.fabs.f32(float %c)
472  %c.fabs.fneg = fsub float -0.0, %c.fabs
473
; Median-of-three pattern: max(min(x, y), min(max(x, y), z)).
474  %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
475  %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
476  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
477  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
478
479  store float %med3, float addrspace(1)* %outgep
480  ret void
481}
482
; Positive case without the function-level no-NaNs attribute: attribute group #1
; leaves "no-nans-fp-math" false, but each operand of the min/max pattern is
; produced by an `fadd nnan`.
; Every check prefix still expects a single v_med3_f32, fed directly by the
; three v_add_f32 results with no extra canonicalizing instructions in between.
483define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
484; SI-LABEL: v_nnan_inputs_med3_f32_pat0:
485; SI:       ; %bb.0:
486; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
487; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
488; SI-NEXT:    v_mov_b32_e32 v1, 0
489; SI-NEXT:    s_mov_b32 s10, 0
490; SI-NEXT:    s_mov_b32 s11, 0xf000
491; SI-NEXT:    s_waitcnt lgkmcnt(0)
492; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
493; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
494; SI-NEXT:    s_waitcnt vmcnt(0)
495; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
496; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
497; SI-NEXT:    s_waitcnt vmcnt(0)
498; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
499; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
500; SI-NEXT:    s_waitcnt vmcnt(0)
501; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
502; SI-NEXT:    v_add_f32_e32 v3, 2.0, v3
503; SI-NEXT:    v_add_f32_e32 v4, 4.0, v4
504; SI-NEXT:    v_med3_f32 v2, v2, v3, v4
505; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
506; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
507; SI-NEXT:    s_endpgm
508;
509; VI-LABEL: v_nnan_inputs_med3_f32_pat0:
510; VI:       ; %bb.0:
511; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
512; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
513; VI-NEXT:    s_waitcnt lgkmcnt(0)
514; VI-NEXT:    v_mov_b32_e32 v0, s2
515; VI-NEXT:    v_mov_b32_e32 v1, s3
516; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
517; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
518; VI-NEXT:    v_mov_b32_e32 v2, s4
519; VI-NEXT:    v_mov_b32_e32 v3, s5
520; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
521; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
522; VI-NEXT:    v_mov_b32_e32 v4, s6
523; VI-NEXT:    v_mov_b32_e32 v5, s7
524; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
525; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
526; VI-NEXT:    flat_load_dword v7, v[0:1] glc
527; VI-NEXT:    s_waitcnt vmcnt(0)
528; VI-NEXT:    flat_load_dword v2, v[2:3] glc
529; VI-NEXT:    s_waitcnt vmcnt(0)
530; VI-NEXT:    flat_load_dword v3, v[4:5] glc
531; VI-NEXT:    s_waitcnt vmcnt(0)
532; VI-NEXT:    v_mov_b32_e32 v0, s0
533; VI-NEXT:    v_mov_b32_e32 v1, s1
534; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
535; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
536; VI-NEXT:    v_add_f32_e32 v4, 1.0, v7
537; VI-NEXT:    v_add_f32_e32 v2, 2.0, v2
538; VI-NEXT:    v_add_f32_e32 v3, 4.0, v3
539; VI-NEXT:    v_med3_f32 v2, v4, v2, v3
540; VI-NEXT:    flat_store_dword v[0:1], v2
541; VI-NEXT:    s_endpgm
542;
543; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
544; GFX9:       ; %bb.0:
545; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
546; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
547; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
548; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
549; GFX9-NEXT:    s_waitcnt vmcnt(0)
550; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
551; GFX9-NEXT:    s_waitcnt vmcnt(0)
552; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
553; GFX9-NEXT:    s_waitcnt vmcnt(0)
554; GFX9-NEXT:    v_add_f32_e32 v1, 1.0, v1
555; GFX9-NEXT:    v_add_f32_e32 v2, 2.0, v2
556; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
557; GFX9-NEXT:    v_med3_f32 v1, v1, v2, v3
558; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
559; GFX9-NEXT:    s_endpgm
560;
561; GFX10-LABEL: v_nnan_inputs_med3_f32_pat0:
562; GFX10:       ; %bb.0:
563; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
564; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
565; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
567; GFX10-NEXT:    s_waitcnt vmcnt(0)
568; GFX10-NEXT:    global_load_dword v2, v0, s[4:5] glc dlc
569; GFX10-NEXT:    s_waitcnt vmcnt(0)
570; GFX10-NEXT:    global_load_dword v3, v0, s[6:7] glc dlc
571; GFX10-NEXT:    s_waitcnt vmcnt(0)
572; GFX10-NEXT:    v_add_f32_e32 v1, 1.0, v1
573; GFX10-NEXT:    v_add_f32_e32 v2, 2.0, v2
574; GFX10-NEXT:    v_add_f32_e32 v3, 4.0, v3
575; GFX10-NEXT:    v_med3_f32 v1, v1, v2, v3
576; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
577; GFX10-NEXT:    s_endpgm
578  %tid = call i32 @llvm.amdgcn.workitem.id.x()
579  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
580  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
581  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
582  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
583  %a = load volatile float, float addrspace(1)* %gep0
584  %b = load volatile float, float addrspace(1)* %gep1
585  %c = load volatile float, float addrspace(1)* %gep2
586
; The nnan flag sits on the instructions producing the operands, not on the
; minnum/maxnum calls themselves.
587  %a.nnan = fadd nnan float %a, 1.0
588  %b.nnan = fadd nnan float %b, 2.0
589  %c.nnan = fadd nnan float %c, 4.0
590
; Median-of-three pattern: max(min(x, y), min(max(x, y), z)).
591  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
592  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
593  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
594  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
595  store float %med3, float addrspace(1)* %outgep
596  ret void
597}
598
599
600; ---------------------------------------------------------------------
601; Negative patterns
602; ---------------------------------------------------------------------
603
; Negative case: %tmp0 = minnum(a, b) has a second use (a volatile store to an
; undef address) in addition to feeding the final maxnum.
; Every check prefix expects the separate v_min_f32 / v_max_f32 sequence, two
; stores, and no v_med3_f32.
; NOTE(review): this function uses attribute group #1, under which
; @v_test_no_global_nnans_med3_f32_pat0_srcmod0 is also expected to stay unfused,
; so the extra use may not be the only thing blocking v_med3_f32 here -- confirm.
604define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
605; SI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
606; SI:       ; %bb.0:
607; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
608; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
609; SI-NEXT:    v_mov_b32_e32 v1, 0
610; SI-NEXT:    s_mov_b32 s10, 0
611; SI-NEXT:    s_mov_b32 s11, 0xf000
612; SI-NEXT:    s_waitcnt lgkmcnt(0)
613; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
614; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
615; SI-NEXT:    s_waitcnt vmcnt(0)
616; SI-NEXT:    s_mov_b64 s[8:9], s[4:5]
617; SI-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 glc
618; SI-NEXT:    s_waitcnt vmcnt(0)
619; SI-NEXT:    s_mov_b32 s2, -1
620; SI-NEXT:    s_mov_b32 s3, s11
621; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
622; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
623; SI-NEXT:    s_waitcnt vmcnt(0)
624; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
625; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
626; SI-NEXT:    v_min_f32_e32 v5, v2, v3
627; SI-NEXT:    v_max_f32_e32 v2, v2, v3
628; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
629; SI-NEXT:    buffer_store_dword v5, off, s[0:3], 0
630; SI-NEXT:    s_waitcnt vmcnt(0)
631; SI-NEXT:    v_min_f32_e32 v2, v2, v3
632; SI-NEXT:    v_max_f32_e32 v2, v5, v2
633; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
634; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
635; SI-NEXT:    s_endpgm
636;
637; VI-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
638; VI:       ; %bb.0:
639; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
640; VI-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
641; VI-NEXT:    s_waitcnt lgkmcnt(0)
642; VI-NEXT:    v_mov_b32_e32 v0, s2
643; VI-NEXT:    v_mov_b32_e32 v1, s3
644; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
645; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
646; VI-NEXT:    v_mov_b32_e32 v2, s4
647; VI-NEXT:    v_mov_b32_e32 v3, s5
648; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
649; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
650; VI-NEXT:    v_mov_b32_e32 v4, s6
651; VI-NEXT:    v_mov_b32_e32 v5, s7
652; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
653; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
654; VI-NEXT:    flat_load_dword v7, v[0:1] glc
655; VI-NEXT:    s_waitcnt vmcnt(0)
656; VI-NEXT:    flat_load_dword v2, v[2:3] glc
657; VI-NEXT:    s_waitcnt vmcnt(0)
658; VI-NEXT:    flat_load_dword v3, v[4:5] glc
659; VI-NEXT:    s_waitcnt vmcnt(0)
660; VI-NEXT:    v_mov_b32_e32 v0, s0
661; VI-NEXT:    v_mov_b32_e32 v1, s1
662; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v6
663; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
664; VI-NEXT:    v_mul_f32_e32 v4, 1.0, v7
665; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
666; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
667; VI-NEXT:    v_min_f32_e32 v5, v4, v2
668; VI-NEXT:    v_max_f32_e32 v2, v4, v2
669; VI-NEXT:    v_min_f32_e32 v2, v2, v3
670; VI-NEXT:    v_max_f32_e32 v2, v5, v2
671; VI-NEXT:    flat_store_dword v[0:1], v5
672; VI-NEXT:    s_waitcnt vmcnt(0)
673; VI-NEXT:    flat_store_dword v[0:1], v2
674; VI-NEXT:    s_endpgm
675;
676; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
677; GFX9:       ; %bb.0:
678; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
679; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
680; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
681; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
682; GFX9-NEXT:    s_waitcnt vmcnt(0)
683; GFX9-NEXT:    global_load_dword v2, v0, s[4:5] glc
684; GFX9-NEXT:    s_waitcnt vmcnt(0)
685; GFX9-NEXT:    global_load_dword v3, v0, s[6:7] glc
686; GFX9-NEXT:    s_waitcnt vmcnt(0)
687; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
688; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
689; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
690; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
691; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
692; GFX9-NEXT:    global_store_dword v[0:1], v4, off
693; GFX9-NEXT:    s_waitcnt vmcnt(0)
694; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
695; GFX9-NEXT:    v_max_f32_e32 v1, v4, v1
696; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
697; GFX9-NEXT:    s_endpgm
698;
699; GFX10-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
700; GFX10:       ; %bb.0:
701; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
702; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
703; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
704; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
705; GFX10-NEXT:    s_waitcnt vmcnt(0)
706; GFX10-NEXT:    global_load_dword v2, v0, s[4:5] glc dlc
707; GFX10-NEXT:    s_waitcnt vmcnt(0)
708; GFX10-NEXT:    global_load_dword v3, v0, s[6:7] glc dlc
709; GFX10-NEXT:    s_waitcnt vmcnt(0)
710; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
711; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
712; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
713; GFX10-NEXT:    v_max_f32_e32 v4, v1, v2
714; GFX10-NEXT:    v_min_f32_e32 v1, v1, v2
715; GFX10-NEXT:    v_min_f32_e32 v2, v4, v3
716; GFX10-NEXT:    v_max_f32_e32 v2, v1, v2
717; GFX10-NEXT:    global_store_dword v[0:1], v1, off
718; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
719; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
720; GFX10-NEXT:    s_endpgm
721  %tid = call i32 @llvm.amdgcn.workitem.id.x()
722  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
723  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
724  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
725  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
726  %a = load volatile float, float addrspace(1)* %gep0
727  %b = load volatile float, float addrspace(1)* %gep1
728  %c = load volatile float, float addrspace(1)* %gep2
729  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
; Extra use of %tmp0: the volatile store keeps the inner minnum result live.
730  store volatile float %tmp0, float addrspace(1)* undef
731  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
732  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
733  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2)
734  store float %med3, float addrspace(1)* %outgep
735  ret void
736}
737
; Intrinsics called by the f32 test kernels in this file.
738declare i32 @llvm.amdgcn.workitem.id.x() #0
739declare float @llvm.fabs.f32(float) #0
740declare float @llvm.minnum.f32(float, float) #0
741declare float @llvm.maxnum.f32(float, float) #0
; The f64 and f16 declarations below are not referenced by any function
; visible in this file.
742declare double @llvm.minnum.f64(double, double) #0
743declare double @llvm.maxnum.f64(double, double) #0
744declare half @llvm.fabs.f16(half) #0
745declare half @llvm.minnum.f16(half, half) #0
746declare half @llvm.maxnum.f16(half, half) #0
747
; #0 is attached to the intrinsic declarations above.
748attributes #0 = { nounwind readnone }
; #1 is used by the kernels that must not assume NaN-free inputs.
749attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
; #2 is used by the kernels named *_global_nnans_*; it sets "no-nans-fp-math" to true.
750attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
751