1; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
2; RUN: llc -march=amdgcn -mcpu=fiji  -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
3
4; GCN-LABEL: {{^}}reduction_fadd_v4f16:
5; GFX9:      v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
6; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
7
8; VI:      v_add_f16_sdwa
9; VI-NEXT: v_add_f16_e32
10; VI-NEXT: v_add_f16_e32
; Sum reduction of a <4 x half> vector written as two shuffle + fadd steps.
; Each step folds the upper part of the vector onto the lower part; the final
; value ends up in lane 0, which is the only lane returned.
11define half @reduction_fadd_v4f16(<4 x half> %vec4) {
12entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
13  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = e0 + e2, lane 1 = e1 + e3.
14  %bin.rdx = fadd <4 x half> %vec4, %rdx.shuf
  ; Move lane 1 of the partial result into lane 0.
15  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ; lane 0 = (e0 + e2) + (e1 + e3).
16  %bin.rdx2 = fadd <4 x half> %bin.rdx, %rdx.shuf1
  ; Only lane 0 holds the fully reduced value.
17  %res = extractelement <4 x half> %bin.rdx2, i32 0
18  ret half %res
19}
20
21; GCN-LABEL: {{^}}reduction_fsub_v4f16:
22; GFX9: s_waitcnt
23; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
24; GFX9-NEXT: v_sub_f16_sdwa v0, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
25; GFX9-NEXT: s_setpc_b64
26
27; VI:      v_sub_f16_sdwa
28; VI-NEXT: v_sub_f16_e32
29; VI-NEXT: v_sub_f16_e32
30; VI-NEXT: s_setpc_b64
; Same two-step shuffle/fold shape as the fadd reduction, but using fsub with
; no fast-math flags. Lane 0 of the final vector is returned.
31define half @reduction_fsub_v4f16(<4 x half> %vec4) {
32entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
33  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = e0 - e2, lane 1 = e1 - e3.
34  %bin.rdx = fsub <4 x half> %vec4, %rdx.shuf
  ; Move lane 1 of the partial result into lane 0.
35  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ; lane 0 = (e0 - e2) - (e1 - e3).
36  %bin.rdx2 = fsub <4 x half> %bin.rdx, %rdx.shuf1
37  %res = extractelement <4 x half> %bin.rdx2, i32 0
38  ret half %res
39}
40
41; Make sure nsz is preserved when the operations are split.
42; GCN-LABEL: {{^}}reduction_fsub_v4f16_preserve_fmf:
43; GFX9: s_waitcnt
44; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
45; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
46; GFX9-NEXT: s_setpc_b64
47
48; VI: s_waitcnt
49; VI-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
50; VI-NEXT: v_sub_f16_e32 v0, v1, v0
51; VI-NEXT: v_add_f16_e32 v0, v2, v0
52; VI-NEXT: s_setpc_b64
; Variant of the fsub reduction in which both vector fsubs carry the nsz flag
; and the scalar result is negated before being returned.
53define half @reduction_fsub_v4f16_preserve_fmf(<4 x half> %vec4) {
54entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
55  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = e0 - e2, lane 1 = e1 - e3 (nsz).
56  %bin.rdx = fsub nsz <4 x half> %vec4, %rdx.shuf
  ; Move lane 1 of the partial result into lane 0.
57  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ; lane 0 = (e0 - e2) - (e1 - e3) (nsz).
58  %bin.rdx2 = fsub nsz <4 x half> %bin.rdx, %rdx.shuf1
59  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ; Negation written as a subtraction from -0.0; this fsub itself has no flags.
60  %neg.res = fsub half -0.0, %res
61  ret half %neg.res
62}
63
64; GCN-LABEL: {{^}}reduction_fmul_half4:
65; GFX9:      v_pk_mul_f16 [[MUL:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
66; GFX9-NEXT: v_mul_f16_sdwa v{{[0-9]+}}, [[MUL]], [[MUL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
67
68; VI:      v_mul_f16_sdwa
69; VI-NEXT: v_mul_f16_e32
70; VI-NEXT: v_mul_f16_e32
; Product reduction of a <4 x half> vector written as two shuffle + fmul
; steps. Lane 0 of the final vector is returned.
71define half @reduction_fmul_half4(<4 x half> %vec4) {
72entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
73  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = e0 * e2, lane 1 = e1 * e3.
74  %bin.rdx = fmul <4 x half> %vec4, %rdx.shuf
  ; Move lane 1 of the partial result into lane 0.
75  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ; lane 0 = (e0 * e2) * (e1 * e3).
76  %bin.rdx2 = fmul <4 x half> %bin.rdx, %rdx.shuf1
77  %res = extractelement <4 x half> %bin.rdx2, i32 0
78  ret half %res
79}
80
81; GCN-LABEL: {{^}}reduction_v4i16:
82; GFX9:      v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
83; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
84
85; VI:      v_add_u16_sdwa
86; VI-NEXT: v_add_u16_e32
87; VI-NEXT: v_add_u16_e32
; Integer sum reduction of a <4 x i16> vector written as two shuffle + add
; steps. Lane 0 of the final vector is returned.
88define i16 @reduction_v4i16(<4 x i16> %vec4) {
89entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
90  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = e0 + e2, lane 1 = e1 + e3.
91  %bin.rdx = add <4 x i16> %vec4, %rdx.shuf
  ; Move lane 1 of the partial result into lane 0.
92  %rdx.shuf1 = shufflevector <4 x i16> %bin.rdx, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ; lane 0 = (e0 + e2) + (e1 + e3).
93  %bin.rdx2 = add <4 x i16> %bin.rdx, %rdx.shuf1
94  %res = extractelement <4 x i16> %bin.rdx2, i32 0
95  ret i16 %res
96}
97
98; GCN-LABEL: {{^}}reduction_half8:
99; GFX9:      v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
100; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
101; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
102; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
103
104; VI:      v_add_f16_sdwa
105; VI-NEXT: v_add_f16_sdwa
106; VI-NEXT: v_add_f16_e32
107; VI-NEXT: v_add_f16_e32
108; VI-NEXT: v_add_f16_e32
109; VI-NEXT: v_add_f16_e32
110; VI-NEXT: v_add_f16_e32
111
; Sum reduction of an <8 x half> vector written as three shuffle + fadd
; steps (8 -> 4 -> 2 -> 1 live lanes). Lane 0 of the final vector is returned.
112define half @reduction_half8(<8 x half> %vec8) {
113entry:
  ; Step 1: move elements 4..7 into lanes 0..3 and add them to elements 0..3.
114  %rdx.shuf = shufflevector <8 x half> %vec8, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
115  %bin.rdx = fadd <8 x half> %vec8, %rdx.shuf
  ; Step 2: move lanes 2 and 3 into lanes 0 and 1 and add.
116  %rdx.shuf1 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
117  %bin.rdx2 = fadd <8 x half> %bin.rdx, %rdx.shuf1
  ; Step 3: move lane 1 into lane 0 and add.
118  %rdx.shuf3 = shufflevector <8 x half> %bin.rdx2, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
119  %bin.rdx4 = fadd <8 x half> %bin.rdx2, %rdx.shuf3
  ; Only lane 0 holds the fully reduced value.
120  %res = extractelement <8 x half> %bin.rdx4, i32 0
121  ret half %res
122}
123
124; GCN-LABEL: {{^}}reduction_v8i16:
125; GFX9:      v_pk_add_u16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
126; GFX9-NEXT: v_pk_add_u16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
127; GFX9-NEXT: v_pk_add_u16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
128; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
129
130; VI:      v_add_u16_sdwa
131; VI-NEXT: v_add_u16_sdwa
132; VI-NEXT: v_add_u16_e32
133; VI-NEXT: v_add_u16_e32
134; VI-NEXT: v_add_u16_e32
135; VI-NEXT: v_add_u16_e32
136; VI-NEXT: v_add_u16_e32
137
; Integer sum reduction of an <8 x i16> vector written as three shuffle + add
; steps (8 -> 4 -> 2 -> 1 live lanes). Lane 0 of the final vector is returned.
138define i16 @reduction_v8i16(<8 x i16> %vec8) {
139entry:
  ; Step 1: move elements 4..7 into lanes 0..3 and add them to elements 0..3.
140  %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
141  %bin.rdx = add <8 x i16> %vec8, %rdx.shuf
  ; Step 2: move lanes 2 and 3 into lanes 0 and 1 and add.
142  %rdx.shuf1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
143  %bin.rdx2 = add <8 x i16> %bin.rdx, %rdx.shuf1
  ; Step 3: move lane 1 into lane 0 and add.
144  %rdx.shuf3 = shufflevector <8 x i16> %bin.rdx2, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
145  %bin.rdx4 = add <8 x i16> %bin.rdx2, %rdx.shuf3
  ; Only lane 0 holds the fully reduced value.
146  %res = extractelement <8 x i16> %bin.rdx4, i32 0
147  ret i16 %res
148}
149
150; GCN-LABEL: {{^}}reduction_half16:
151; GFX9:      v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
152; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
153; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
154; GFX9:      v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
155; GFX9-NEXT: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
156; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
157; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
158; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
159
160; VI:      v_add_f16_sdwa
161; VI-NEXT: v_add_f16_sdwa
162; VI-NEXT: v_add_f16_sdwa
163; VI-NEXT: v_add_f16_sdwa
164; VI-NEXT: v_add_f16_e32
165; VI-NEXT: v_add_f16_e32
166; VI-NEXT: v_add_f16_e32
167; VI-NEXT: v_add_f16_e32
168; VI-NEXT: v_add_f16_e32
169; VI-NEXT: v_add_f16_e32
170; VI-NEXT: v_add_f16_e32
171; VI-NEXT: v_add_f16_e32
172; VI-NEXT: v_add_f16_e32
173; VI-NEXT: v_add_f16_e32
174; VI-NEXT: v_add_f16_e32
175
; Sum reduction of a <16 x half> vector written as four shuffle + fadd steps
; (16 -> 8 -> 4 -> 2 -> 1 live lanes). Lane 0 of the final vector is returned.
176define half @reduction_half16(<16 x half> %vec16) {
177entry:
  ; Step 1: move elements 8..15 into lanes 0..7 and add them to elements 0..7.
178  %rdx.shuf = shufflevector <16 x half> %vec16, <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
179  %bin.rdx = fadd <16 x half> %vec16, %rdx.shuf
  ; Step 2: move lanes 4..7 into lanes 0..3 and add.
180  %rdx.shuf1 = shufflevector <16 x half> %bin.rdx, <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
181  %bin.rdx2 = fadd <16 x half> %bin.rdx, %rdx.shuf1
  ; Step 3: move lanes 2 and 3 into lanes 0 and 1 and add.
182  %rdx.shuf3 = shufflevector <16 x half> %bin.rdx2, <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
183  %bin.rdx4 = fadd <16 x half> %bin.rdx2, %rdx.shuf3
  ; Step 4: move lane 1 into lane 0 and add.
184  %rdx.shuf5 = shufflevector <16 x half> %bin.rdx4, <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
185  %bin.rdx6 = fadd <16 x half> %bin.rdx4, %rdx.shuf5
  ; Only lane 0 holds the fully reduced value.
186  %res = extractelement <16 x half> %bin.rdx6, i32 0
187  ret half %res
188}
189
190; GCN-LABEL: {{^}}reduction_min_v4i16:
191; GFX9:      v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
192; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
193
194; VI:      v_min_u16_sdwa
195; VI-NEXT: v_min_u16_e32
196; VI-NEXT: v_min_u16_e32
; Minimum reduction of a <4 x i16> vector written as two shuffle +
; compare/select steps. Despite the unqualified name, the compares are
; unsigned (icmp ult). Lane 0 of the final vector is returned.
197define i16 @reduction_min_v4i16(<4 x i16> %vec4) {
198entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
199  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; Per lane, keep the unsigned-smaller of the original and shuffled element.
200  %rdx.minmax.cmp = icmp ult <4 x i16> %vec4, %rdx.shuf
201  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  ; Move lane 1 of the partial result into lane 0 and take the min again.
202  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
203  %rdx.minmax.cmp2 = icmp ult <4 x i16> %rdx.minmax.select, %rdx.shuf1
204  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  ; Only lane 0 holds the fully reduced value.
205  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
206  ret i16 %res
207}
208
209; GCN-LABEL: {{^}}reduction_umin_v8i16:
210; GFX9:      v_pk_min_u16 [[MIN1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
211; GFX9-NEXT: v_pk_min_u16 [[MIN2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
212; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}}
213; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
214
215; VI:      v_min_u16_sdwa
216; VI-NEXT: v_min_u16_sdwa
217; VI-NEXT: v_min_u16_e32
218; VI-NEXT: v_min_u16_e32
219; VI-NEXT: v_min_u16_e32
220; VI-NEXT: v_min_u16_e32
221; VI-NEXT: v_min_u16_e32
; Unsigned minimum reduction of an <8 x i16> vector written as three shuffle +
; icmp ult/select steps (8 -> 4 -> 2 -> 1 live lanes). Lane 0 of the final
; vector is returned.
222define i16 @reduction_umin_v8i16(<8 x i16> %vec8) {
223entry:
  ; Step 1: move elements 4..7 into lanes 0..3 and keep the smaller per lane.
224  %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
225  %rdx.minmax.cmp = icmp ult <8 x i16> %vec8, %rdx.shuf
226  %rdx.minmax.select = select <8 x i1> %rdx.minmax.cmp, <8 x i16> %vec8, <8 x i16> %rdx.shuf
  ; Step 2: move lanes 2 and 3 into lanes 0 and 1 and keep the smaller per lane.
227  %rdx.shuf1 = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
228  %rdx.minmax.cmp2 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf1
229  %rdx.minmax.select3 = select <8 x i1> %rdx.minmax.cmp2, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf1
  ; Step 3: move lane 1 into lane 0 and keep the smaller.
230  %rdx.shuf4 = shufflevector <8 x i16> %rdx.minmax.select3, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
231  %rdx.minmax.cmp5 = icmp ult <8 x i16> %rdx.minmax.select3, %rdx.shuf4
232  %rdx.minmax.select6 = select <8 x i1> %rdx.minmax.cmp5, <8 x i16> %rdx.minmax.select3, <8 x i16> %rdx.shuf4
  ; Only lane 0 holds the fully reduced value.
233  %res = extractelement <8 x i16> %rdx.minmax.select6, i32 0
234  ret i16 %res
235}
236
237; Test to make sure that, without SLP, the number of instructions is larger.
238; GCN-LABEL: {{^}}reduction_umin_v8i16_woslp:
239; GFX9:      v_lshrrev_b32_e32
240; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
241; GFX9-NEXT: v_lshrrev_b32_e32
242; GFX9-NEXT: v_min3_u16
243; GFX9-NEXT: v_lshrrev_b32_e32
244; GFX9-NEXT: v_min3_u16
245; GFX9-NEXT: v_min3_u16
; Scalar form of the 8-element unsigned minimum: every element is extracted
; individually and folded into a running minimum by a sequential chain of
; icmp ult / select pairs. The last running minimum is returned.
246define i16 @reduction_umin_v8i16_woslp(<8 x i16> %vec8) {
247entry:
  ; Pull all eight elements out of the vector.
248  %elt0 = extractelement <8 x i16> %vec8, i64 0
249  %elt1 = extractelement <8 x i16> %vec8, i64 1
250  %elt2 = extractelement <8 x i16> %vec8, i64 2
251  %elt3 = extractelement <8 x i16> %vec8, i64 3
252  %elt4 = extractelement <8 x i16> %vec8, i64 4
253  %elt5 = extractelement <8 x i16> %vec8, i64 5
254  %elt6 = extractelement <8 x i16> %vec8, i64 6
255  %elt7 = extractelement <8 x i16> %vec8, i64 7
256
  ; %minN = unsigned min of elements 0..N; each step depends on the previous.
257  %cmp0 = icmp ult i16 %elt1, %elt0
258  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
259  %cmp1 = icmp ult i16 %elt2, %min1
260  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
261  %cmp2 = icmp ult i16 %elt3, %min2
262  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
263
264  %cmp3 = icmp ult i16 %elt4, %min3
265  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
266  %cmp4 = icmp ult i16 %elt5, %min4
267  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
268
269  %cmp5 = icmp ult i16 %elt6, %min5
270  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
271  %cmp6 = icmp ult i16 %elt7, %min6
272  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
273
274  ret i16 %min7
275}
276
277; GCN-LABEL: {{^}}reduction_smin_v16i16:
278; GFX9:        v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
279; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
280; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
281; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
282; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
283; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
284; GFX9-NEXT:   v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
285; GFX9-NEXT:   v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
286
287; VI:      v_min_i16_sdwa
288; VI-NEXT: v_min_i16_sdwa
289; VI-NEXT: v_min_i16_sdwa
290; VI-NEXT: v_min_i16_sdwa
291; VI-NEXT: v_min_i16_e32
292; VI-NEXT: v_min_i16_e32
293; VI-NEXT: v_min_i16_e32
294; VI-NEXT: v_min_i16_e32
295; VI-NEXT: v_min_i16_e32
296; VI-NEXT: v_min_i16_e32
297; VI-NEXT: v_min_i16_e32
298; VI-NEXT: v_min_i16_e32
299; VI-NEXT: v_min_i16_e32
300; VI-NEXT: v_min_i16_e32
301; VI-NEXT: v_min_i16_e32
; Signed minimum reduction of a <16 x i16> vector written as four shuffle +
; icmp slt/select steps (16 -> 8 -> 4 -> 2 -> 1 live lanes). Lane 0 of the
; final vector is returned.
302define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
303entry:
  ; Step 1: move elements 8..15 into lanes 0..7 and keep the smaller per lane.
304  %rdx.shuf = shufflevector <16 x i16> %vec16, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
305  %rdx.minmax.cmp = icmp slt <16 x i16> %vec16, %rdx.shuf
306  %rdx.minmax.select = select <16 x i1> %rdx.minmax.cmp, <16 x i16> %vec16, <16 x i16> %rdx.shuf
  ; Step 2: move lanes 4..7 into lanes 0..3 and keep the smaller per lane.
307  %rdx.shuf1 = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
308  %rdx.minmax.cmp2 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf1
309  %rdx.minmax.select3 = select <16 x i1> %rdx.minmax.cmp2, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf1
  ; Step 3: move lanes 2 and 3 into lanes 0 and 1 and keep the smaller per lane.
310  %rdx.shuf4 = shufflevector <16 x i16> %rdx.minmax.select3, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
311  %rdx.minmax.cmp5 = icmp slt <16 x i16> %rdx.minmax.select3, %rdx.shuf4
312  %rdx.minmax.select6 = select <16 x i1> %rdx.minmax.cmp5, <16 x i16> %rdx.minmax.select3, <16 x i16> %rdx.shuf4
  ; Step 4: move lane 1 into lane 0 and keep the smaller.
313  %rdx.shuf7 = shufflevector <16 x i16> %rdx.minmax.select6, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
314  %rdx.minmax.cmp8 = icmp slt <16 x i16> %rdx.minmax.select6, %rdx.shuf7
315  %rdx.minmax.select9 = select <16 x i1> %rdx.minmax.cmp8, <16 x i16> %rdx.minmax.select6, <16 x i16> %rdx.shuf7
  ; Only lane 0 holds the fully reduced value.
316  %res = extractelement <16 x i16> %rdx.minmax.select9, i32 0
317  ret i16 %res
318}
319
320; Test to make sure that, without SLP, the number of instructions is larger.
321; GCN-LABEL: {{^}}reduction_smin_v16i16_woslp:
322; GFX9:      v_lshrrev_b32_e32
323; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
324; GFX9-NEXT: v_lshrrev_b32_e32
325; GFX9-NEXT: v_min3_i16
326; GFX9-NEXT: v_lshrrev_b32_e32
327; GFX9-NEXT: v_min3_i16
328; GFX9-NEXT: v_lshrrev_b32_e32
329; GFX9-NEXT: v_min3_i16
330; GFX9-NEXT: v_lshrrev_b32_e32
331; GFX9-NEXT: v_min3_i16
332; GFX9-NEXT: v_lshrrev_b32_e32
333; GFX9-NEXT: v_min3_i16
334; GFX9-NEXT: v_lshrrev_b32_e32
335; GFX9-NEXT: v_min3_i16
336; GFX9-NEXT: v_min3_i16
; Scalar form of the 16-element signed minimum: every element is extracted
; individually and folded into a running minimum by a sequential chain of
; icmp slt / select pairs. The last running minimum is returned.
337define i16 @reduction_smin_v16i16_woslp(<16 x i16> %vec16) {
338entry:
  ; Pull all sixteen elements out of the vector.
339  %elt0 = extractelement <16 x i16> %vec16, i64 0
340  %elt1 = extractelement <16 x i16> %vec16, i64 1
341  %elt2 = extractelement <16 x i16> %vec16, i64 2
342  %elt3 = extractelement <16 x i16> %vec16, i64 3
343  %elt4 = extractelement <16 x i16> %vec16, i64 4
344  %elt5 = extractelement <16 x i16> %vec16, i64 5
345  %elt6 = extractelement <16 x i16> %vec16, i64 6
346  %elt7 = extractelement <16 x i16> %vec16, i64 7
347
348  %elt8 = extractelement <16 x i16> %vec16, i64 8
349  %elt9 = extractelement <16 x i16> %vec16, i64 9
350  %elt10 = extractelement <16 x i16> %vec16, i64 10
351  %elt11 = extractelement <16 x i16> %vec16, i64 11
352  %elt12 = extractelement <16 x i16> %vec16, i64 12
353  %elt13 = extractelement <16 x i16> %vec16, i64 13
354  %elt14 = extractelement <16 x i16> %vec16, i64 14
355  %elt15 = extractelement <16 x i16> %vec16, i64 15
356
  ; %minN = signed min of elements 0..N; each step depends on the previous.
357  %cmp0 = icmp slt i16 %elt1, %elt0
358  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
359  %cmp1 = icmp slt i16 %elt2, %min1
360  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
361  %cmp2 = icmp slt i16 %elt3, %min2
362  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
363
364  %cmp3 = icmp slt i16 %elt4, %min3
365  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
366  %cmp4 = icmp slt i16 %elt5, %min4
367  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
368
369  %cmp5 = icmp slt i16 %elt6, %min5
370  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
371  %cmp6 = icmp slt i16 %elt7, %min6
372  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
373
374  %cmp7 = icmp slt i16 %elt8, %min7
375  %min8 = select i1 %cmp7, i16 %elt8, i16 %min7
376  %cmp8 = icmp slt i16 %elt9, %min8
377  %min9 = select i1 %cmp8, i16 %elt9, i16 %min8
378
379  %cmp9 = icmp slt i16 %elt10, %min9
380  %min10 = select i1 %cmp9, i16 %elt10, i16 %min9
381  %cmp10 = icmp slt i16 %elt11, %min10
382  %min11 = select i1 %cmp10, i16 %elt11, i16 %min10
383
384  %cmp11 = icmp slt i16 %elt12, %min11
385  %min12 = select i1 %cmp11, i16 %elt12, i16 %min11
386  %cmp12 = icmp slt i16 %elt13, %min12
387  %min13 = select i1 %cmp12, i16 %elt13, i16 %min12
388
389  %cmp13 = icmp slt i16 %elt14, %min13
390  %min14 = select i1 %cmp13, i16 %elt14, i16 %min13
391  %cmp14 = icmp slt i16 %elt15, %min14
392  %min15 = select i1 %cmp14, i16 %elt15, i16 %min14
393
394
395  ret i16 %min15
396}
397
398; GCN-LABEL: {{^}}reduction_umax_v4i16:
399; GFX9:      v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
400; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
401
402; VI:      v_max_u16_sdwa
403; VI-NEXT: v_max_u16_e32
404; VI-NEXT: v_max_u16_e32
; Unsigned maximum reduction of a <4 x i16> vector written as two shuffle +
; icmp ugt/select steps. Lane 0 of the final vector is returned.
405define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
406entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
407  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; Per lane, keep the unsigned-larger of the original and shuffled element.
408  %rdx.minmax.cmp = icmp ugt <4 x i16> %vec4, %rdx.shuf
409  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  ; Move lane 1 of the partial result into lane 0 and take the max again.
410  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
411  %rdx.minmax.cmp2 = icmp ugt <4 x i16> %rdx.minmax.select, %rdx.shuf1
412  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  ; Only lane 0 holds the fully reduced value.
413  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
414  ret i16 %res
415}
416
417; GCN-LABEL: {{^}}reduction_smax_v4i16:
418; GFX9:      v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
419; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
420
421; VI:      v_max_i16_sdwa
422; VI-NEXT: v_max_i16_e32
423; VI-NEXT: v_max_i16_e32
; Signed maximum reduction of a <4 x i16> vector written as two shuffle +
; icmp sgt/select steps. Lane 0 of the final vector is returned.
; NOTE(review): this is the only function here that references attribute
; group #0, and no definition of that group is visible in this part of the
; file - confirm it exists elsewhere or that the reference is intentional.
424define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 {
425entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
426  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; Per lane, keep the signed-larger of the original and shuffled element.
427  %rdx.minmax.cmp = icmp sgt <4 x i16> %vec4, %rdx.shuf
428  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  ; Move lane 1 of the partial result into lane 0 and take the max again.
429  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
430  %rdx.minmax.cmp2 = icmp sgt <4 x i16> %rdx.minmax.select, %rdx.shuf1
431  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  ; Only lane 0 holds the fully reduced value.
432  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
433  ret i16 %res
434}
435
436; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
437; GFX9: s_waitcnt
438; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
439; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
440; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
441; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
442
443
444; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
445; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
446; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
447; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
448
449; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
450; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
451; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
; Maximum reduction of a <4 x half> vector written as two shuffle steps, each
; combined through a call to the llvm.maxnum.v4f16 intrinsic. Lane 0 of the
; final vector is returned.
452define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
453entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
454  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = maxnum(e0, e2), lane 1 = maxnum(e1, e3).
455  %rdx.minmax = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
  ; Move lane 1 of the partial result into lane 0.
456  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ; lane 0 = maxnum of the two partial results.
457  %rdx.minmax3 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
458  %res = extractelement <4 x half> %rdx.minmax3, i32 0
459  ret half %res
460}
461
462; GCN-LABEL: {{^}}reduction_minnum_v4f16:
463; GFX9: s_waitcnt
464; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
465; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
466; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
467; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
468
469; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
470; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
471; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
472; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
473
474; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
475; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
476; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
; Minimum reduction of a <4 x half> vector written as two shuffle steps, each
; combined through a call to the llvm.minnum.v4f16 intrinsic. Lane 0 of the
; final vector is returned.
477define half @reduction_minnum_v4f16(<4 x half> %vec4) {
478entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
479  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; lane 0 = minnum(e0, e2), lane 1 = minnum(e1, e3).
480  %rdx.minmax = call <4 x half> @llvm.minnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
  ; Move lane 1 of the partial result into lane 0.
481  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ; lane 0 = minnum of the two partial results.
482  %rdx.minmax3 = call <4 x half> @llvm.minnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
483  %res = extractelement <4 x half> %rdx.minmax3, i32 0
484  ret half %res
485}
486
487; FIXME: Need to preserve fast math flags when fmaxnum is matched
488; directly from the IR to avoid unnecessary quieting.
489
490; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
491; XGFX9:      v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
492; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
493
494; XVI: s_waitcnt
495; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
496; XVI-NEXT: v_max_f16_e32 v0, v0, v1
497; XVI-NEXT: v_max_f16_e32 v0, v0, v2
498; XVI-NEXT: s_setpc_b64
499
500; GFX9: s_waitcnt
501; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
502; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
503; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
504; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
505
506; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
507; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
508; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
509; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
510
511; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
512; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
513; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
; Maximum reduction of a <4 x half> vector where each step is spelled as an
; fcmp ogt + select pair rather than an intrinsic call. The nnan and nsz
; flags are on the compares only; the selects carry no flags. Lane 0 of the
; final vector is returned.
514define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
515entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
516  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; Per lane, select the original element when it compares greater, else the shuffled one.
517  %rdx.minmax.cmp = fcmp nnan nsz ogt <4 x half> %vec4, %rdx.shuf
518  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
  ; Move lane 1 of the partial result into lane 0 and repeat the compare/select.
519  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
520  %rdx.minmax.cmp2 = fcmp nnan nsz ogt <4 x half> %rdx.minmax.select, %rdx.shuf1
521  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
  ; Only lane 0 holds the fully reduced value.
522  %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
523  ret half %res
524}
525
526; FIXME: Need to preserve fast math flags when fminnum is matched
527; directly from the IR to avoid unnecessary quieting.
528
529; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
530; XGFX9:      v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
531; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
532
533; XVI: s_waitcnt
534; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
535; XVI-NEXT: v_min_f16_e32 v0, v0, v1
536; XVI-NEXT: v_min_f16_e32 v0, v0, v2
537; XVI-NEXT: s_setpc_b64
538
539; GFX9: s_waitcnt
540; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
541; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
542; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
543; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
544
545; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
546; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
547; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
548; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1
549
550; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
551; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
552; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
; Minimum reduction of a <4 x half> vector where each step is spelled as an
; fcmp olt + select pair rather than an intrinsic call. The nnan and nsz
; flags are on the compares only; the selects carry no flags. Lane 0 of the
; final vector is returned.
553define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
554entry:
  ; Move elements 2 and 3 into lanes 0 and 1; the remaining lanes are undef.
555  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  ; Per lane, select the original element when it compares less, else the shuffled one.
556  %rdx.minmax.cmp = fcmp nnan nsz olt <4 x half> %vec4, %rdx.shuf
557  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
  ; Move lane 1 of the partial result into lane 0 and repeat the compare/select.
558  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
559  %rdx.minmax.cmp2 = fcmp nnan nsz olt <4 x half> %rdx.minmax.select, %rdx.shuf1
560  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
  ; Only lane 0 holds the fully reduced value.
561  %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
562  ret half %res
563}
564
; Declarations of the <4 x half> min/max intrinsics called by
; reduction_minnum_v4f16 and reduction_maxnum_v4f16.
565declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
566declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)
567