; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

; Codegen checks for reductions over vectors of 16-bit elements. The vector
; forms are written as a shufflevector + binary-op tree whose result is read
; from lane 0.

; fadd reduction of <4 x half>: lanes 2,3 are folded onto lanes 0,1, then
; lane 1 is folded onto lane 0.
; GCN-LABEL: {{^}}reduction_fadd_v4f16:
; GFX9: v_pk_add_f16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_f16_sdwa
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
define half @reduction_fadd_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fadd <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fadd <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ret half %res
}

; Same tree shape with fsub and no fast-math flags. The gfx900 checks expect
; the packed step to be a v_pk_add_f16 with the second operand negated.
; GCN-LABEL: {{^}}reduction_fsub_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_add_f16 [[ADD:v[0-9]+]], v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GFX9-NEXT: v_sub_f16_sdwa v0, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64

; VI: v_sub_f16_sdwa
; VI-NEXT: v_sub_f16_e32
; VI-NEXT: v_sub_f16_e32
; VI-NEXT: s_setpc_b64
define half @reduction_fsub_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fsub <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fsub <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ret half %res
}

; Make sure nsz is preserved when the operations are split.
; GCN-LABEL: {{^}}reduction_fsub_v4f16_preserve_fmf:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]{{$}}
; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64

; VI: s_waitcnt
; VI-NEXT: v_sub_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_f16_e32 v0, v1, v0
; VI-NEXT: v_add_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64
define half @reduction_fsub_v4f16_preserve_fmf(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fsub nsz <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fsub nsz <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ; Negate the reduced value; the checks above expect no separate negate
  ; instruction in the output.
  %neg.res = fsub half -0.0, %res
  ret half %neg.res
}

; fmul reduction of <4 x half> using the same two-step shuffle tree.
; GCN-LABEL: {{^}}reduction_fmul_half4:
; GFX9: v_pk_mul_f16 [[MUL:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_mul_f16_sdwa v{{[0-9]+}}, [[MUL]], [[MUL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_mul_f16_sdwa
; VI-NEXT: v_mul_f16_e32
; VI-NEXT: v_mul_f16_e32
define half @reduction_fmul_half4(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = fmul <4 x half> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fmul <4 x half> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x half> %bin.rdx2, i32 0
  ret half %res
}

; Integer add reduction of <4 x i16>.
; GCN-LABEL: {{^}}reduction_v4i16:
; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD]], [[ADD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_u16_sdwa
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32
define i16 @reduction_v4i16(<4 x i16> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %bin.rdx = add <4 x i16> %vec4, %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %bin.rdx, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <4 x i16> %bin.rdx, %rdx.shuf1
  %res = extractelement <4 x i16> %bin.rdx2, i32 0
  ret i16 %res
}

; fadd reduction of <8 x half>: three shuffle/fadd steps (lanes 4-7, then
; lanes 2-3, then lane 1).
; GCN-LABEL: {{^}}reduction_half8:
; GFX9: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_f16_sdwa
; VI-NEXT: v_add_f16_sdwa
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32

define half @reduction_half8(<8 x half> %vec8) {
entry:
  %rdx.shuf = shufflevector <8 x half> %vec8, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <8 x half> %vec8, %rdx.shuf
  %rdx.shuf1 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fadd <8 x half> %bin.rdx, %rdx.shuf1
  %rdx.shuf3 = shufflevector <8 x half> %bin.rdx2, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <8 x half> %bin.rdx2, %rdx.shuf3
  %res = extractelement <8 x half> %bin.rdx4, i32 0
  ret half %res
}

; Integer add reduction of <8 x i16>.
; GCN-LABEL: {{^}}reduction_v8i16:
; GFX9: v_pk_add_u16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_u16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_u16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
; GFX9-NEXT: v_add_u16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_u16_sdwa
; VI-NEXT: v_add_u16_sdwa
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32
; VI-NEXT: v_add_u16_e32

define i16 @reduction_v8i16(<8 x i16> %vec8) {
entry:
  %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = add <8 x i16> %vec8, %rdx.shuf
  %rdx.shuf1 = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = add <8 x i16> %bin.rdx, %rdx.shuf1
  %rdx.shuf3 = shufflevector <8 x i16> %bin.rdx2, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = add <8 x i16> %bin.rdx2, %rdx.shuf3
  %res = extractelement <8 x i16> %bin.rdx4, i32 0
  ret i16 %res
}

; fadd reduction of <16 x half>: four shuffle/fadd steps (lanes 8-15, 4-7,
; 2-3, then lane 1).
; GCN-LABEL: {{^}}reduction_half16:
; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_add_f16 [[ADD3:v[0-9]+]], [[ADD2]], [[ADD1]]{{$}}
; GFX9-NEXT: v_add_f16_sdwa v{{[0-9]+}}, [[ADD3]], [[ADD3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_add_f16_sdwa
; VI-NEXT: v_add_f16_sdwa
; VI-NEXT: v_add_f16_sdwa
; VI-NEXT: v_add_f16_sdwa
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32
; VI-NEXT: v_add_f16_e32

define half @reduction_half16(<16 x half> %vec16) {
entry:
  %rdx.shuf = shufflevector <16 x half> %vec16, <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx = fadd <16 x half> %vec16, %rdx.shuf
  %rdx.shuf1 = shufflevector <16 x half> %bin.rdx, <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx2 = fadd <16 x half> %bin.rdx, %rdx.shuf1
  %rdx.shuf3 = shufflevector <16 x half> %bin.rdx2, <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx4 = fadd <16 x half> %bin.rdx2, %rdx.shuf3
  %rdx.shuf5 = shufflevector <16 x half> %bin.rdx4, <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %bin.rdx6 = fadd <16 x half> %bin.rdx4, %rdx.shuf5
  %res = extractelement <16 x half> %bin.rdx6, i32 0
  ret half %res
}

; Unsigned min reduction of <4 x i16>, written as icmp ult + select.
; GCN-LABEL: {{^}}reduction_min_v4i16:
; GFX9: v_pk_min_u16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_min_u16_sdwa
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
define i16 @reduction_min_v4i16(<4 x i16> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp ult <4 x i16> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp ult <4 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
  ret i16 %res
}

; Unsigned min reduction of <8 x i16> in vector (shuffle tree) form.
; GCN-LABEL: {{^}}reduction_umin_v8i16:
; GFX9: v_pk_min_u16 [[MIN1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_u16 [[MIN2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_u16 [[MIN3:v[0-9]+]], [[MIN2]], [[MIN1]]{{$}}
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, [[MIN3]], [[MIN3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_min_u16_sdwa
; VI-NEXT: v_min_u16_sdwa
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
; VI-NEXT: v_min_u16_e32
define i16 @reduction_umin_v8i16(<8 x i16> %vec8) {
entry:
  %rdx.shuf = shufflevector <8 x i16> %vec8, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp ult <8 x i16> %vec8, %rdx.shuf
  %rdx.minmax.select = select <8 x i1> %rdx.minmax.cmp, <8 x i16> %vec8, <8 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <8 x i1> %rdx.minmax.cmp2, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf1
  %rdx.shuf4 = shufflevector <8 x i16> %rdx.minmax.select3, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp5 = icmp ult <8 x i16> %rdx.minmax.select3, %rdx.shuf4
  %rdx.minmax.select6 = select <8 x i1> %rdx.minmax.cmp5, <8 x i16> %rdx.minmax.select3, <8 x i16> %rdx.shuf4
  %res = extractelement <8 x i16> %rdx.minmax.select6, i32 0
  ret i16 %res
}

; Check that the scalar (non-SLP) form of the same umin reduction, a chain of
; icmp/select over the extracted elements, needs more instructions than the
; vector form above.
; GCN-LABEL: {{^}}reduction_umin_v8i16_woslp:
; GFX9: v_lshrrev_b32_e32
; GFX9-NEXT: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_u16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_u16
; GFX9-NEXT: v_min3_u16
define i16 @reduction_umin_v8i16_woslp(<8 x i16> %vec8) {
entry:
  %elt0 = extractelement <8 x i16> %vec8, i64 0
  %elt1 = extractelement <8 x i16> %vec8, i64 1
  %elt2 = extractelement <8 x i16> %vec8, i64 2
  %elt3 = extractelement <8 x i16> %vec8, i64 3
  %elt4 = extractelement <8 x i16> %vec8, i64 4
  %elt5 = extractelement <8 x i16> %vec8, i64 5
  %elt6 = extractelement <8 x i16> %vec8, i64 6
  %elt7 = extractelement <8 x i16> %vec8, i64 7

  %cmp0 = icmp ult i16 %elt1, %elt0
  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
  %cmp1 = icmp ult i16 %elt2, %min1
  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
  %cmp2 = icmp ult i16 %elt3, %min2
  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2

  %cmp3 = icmp ult i16 %elt4, %min3
  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
  %cmp4 = icmp ult i16 %elt5, %min4
  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4

  %cmp5 = icmp ult i16 %elt6, %min5
  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
  %cmp6 = icmp ult i16 %elt7, %min6
  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6

  ret i16 %min7
}

; Signed min reduction of <16 x i16> in vector (shuffle tree) form, written
; as icmp slt + select.
; GCN-LABEL: {{^}}reduction_smin_v16i16:
; GFX9: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_pk_min_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_min_i16_sdwa
; VI-NEXT: v_min_i16_sdwa
; VI-NEXT: v_min_i16_sdwa
; VI-NEXT: v_min_i16_sdwa
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
; VI-NEXT: v_min_i16_e32
define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
entry:
  %rdx.shuf = shufflevector <16 x i16> %vec16, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp slt <16 x i16> %vec16, %rdx.shuf
  %rdx.minmax.select = select <16 x i1> %rdx.minmax.cmp, <16 x i16> %vec16, <16 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <16 x i1> %rdx.minmax.cmp2, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf1
  %rdx.shuf4 = shufflevector <16 x i16> %rdx.minmax.select3, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp5 = icmp slt <16 x i16> %rdx.minmax.select3, %rdx.shuf4
  %rdx.minmax.select6 = select <16 x i1> %rdx.minmax.cmp5, <16 x i16> %rdx.minmax.select3, <16 x i16> %rdx.shuf4
  %rdx.shuf7 = shufflevector <16 x i16> %rdx.minmax.select6, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp8 = icmp slt <16 x i16> %rdx.minmax.select6, %rdx.shuf7
  %rdx.minmax.select9 = select <16 x i1> %rdx.minmax.cmp8, <16 x i16> %rdx.minmax.select6, <16 x i16> %rdx.shuf7
  %res = extractelement <16 x i16> %rdx.minmax.select9, i32 0
  ret i16 %res
}

; Check that the scalar (non-SLP) form of the reduction needs more instructions.
; GCN-LABEL: {{^}}reduction_smin_v16i16_woslp:
; GFX9: v_lshrrev_b32_e32
; GFX9-NEXT: v_min_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_lshrrev_b32_e32
; GFX9-NEXT: v_min3_i16
; GFX9-NEXT: v_min3_i16
define i16 @reduction_smin_v16i16_woslp(<16 x i16> %vec16) {
entry:
  %elt0 = extractelement <16 x i16> %vec16, i64 0
  %elt1 = extractelement <16 x i16> %vec16, i64 1
  %elt2 = extractelement <16 x i16> %vec16, i64 2
  %elt3 = extractelement <16 x i16> %vec16, i64 3
  %elt4 = extractelement <16 x i16> %vec16, i64 4
  %elt5 = extractelement <16 x i16> %vec16, i64 5
  %elt6 = extractelement <16 x i16> %vec16, i64 6
  %elt7 = extractelement <16 x i16> %vec16, i64 7

  %elt8 = extractelement <16 x i16> %vec16, i64 8
  %elt9 = extractelement <16 x i16> %vec16, i64 9
  %elt10 = extractelement <16 x i16> %vec16, i64 10
  %elt11 = extractelement <16 x i16> %vec16, i64 11
  %elt12 = extractelement <16 x i16> %vec16, i64 12
  %elt13 = extractelement <16 x i16> %vec16, i64 13
  %elt14 = extractelement <16 x i16> %vec16, i64 14
  %elt15 = extractelement <16 x i16> %vec16, i64 15

  ; Running signed minimum: each step compares the next element with the
  ; minimum so far.
  %cmp0 = icmp slt i16 %elt1, %elt0
  %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
  %cmp1 = icmp slt i16 %elt2, %min1
  %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
  %cmp2 = icmp slt i16 %elt3, %min2
  %min3 = select i1 %cmp2, i16 %elt3, i16 %min2

  %cmp3 = icmp slt i16 %elt4, %min3
  %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
  %cmp4 = icmp slt i16 %elt5, %min4
  %min5 = select i1 %cmp4, i16 %elt5, i16 %min4

  %cmp5 = icmp slt i16 %elt6, %min5
  %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
  %cmp6 = icmp slt i16 %elt7, %min6
  %min7 = select i1 %cmp6, i16 %elt7, i16 %min6

  %cmp7 = icmp slt i16 %elt8, %min7
  %min8 = select i1 %cmp7, i16 %elt8, i16 %min7
  %cmp8 = icmp slt i16 %elt9, %min8
  %min9 = select i1 %cmp8, i16 %elt9, i16 %min8

  %cmp9 = icmp slt i16 %elt10, %min9
  %min10 = select i1 %cmp9, i16 %elt10, i16 %min9
  %cmp10 = icmp slt i16 %elt11, %min10
  %min11 = select i1 %cmp10, i16 %elt11, i16 %min10

  %cmp11 = icmp slt i16 %elt12, %min11
  %min12 = select i1 %cmp11, i16 %elt12, i16 %min11
  %cmp12 = icmp slt i16 %elt13, %min12
  %min13 = select i1 %cmp12, i16 %elt13, i16 %min12

  %cmp13 = icmp slt i16 %elt14, %min13
  %min14 = select i1 %cmp13, i16 %elt14, i16 %min13
  %cmp14 = icmp slt i16 %elt15, %min14
  %min15 = select i1 %cmp14, i16 %elt15, i16 %min14

  ret i16 %min15
}

; Unsigned max reduction of <4 x i16>, written as icmp ugt + select.
; GCN-LABEL: {{^}}reduction_umax_v4i16:
; GFX9: v_pk_max_u16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_max_u16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_max_u16_sdwa
; VI-NEXT: v_max_u16_e32
; VI-NEXT: v_max_u16_e32
define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp ugt <4 x i16> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp ugt <4 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
  ret i16 %res
}

; Signed max reduction of <4 x i16>, written as icmp sgt + select.
; NOTE(review): this function references attribute group #0, which is not
; defined in the visible part of the file; confirm whether that is intended.
; GCN-LABEL: {{^}}reduction_smax_v4i16:
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_max_i16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_max_i16_sdwa
; VI-NEXT: v_max_i16_e32
; VI-NEXT: v_max_i16_e32
define i16 @reduction_smax_v4i16(<4 x i16> %vec4) #0 {
entry:
  %rdx.shuf = shufflevector <4 x i16> %vec4, <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = icmp sgt <4 x i16> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x i16> %vec4, <4 x i16> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x i16> %rdx.minmax.select, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = icmp sgt <4 x i16> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x i16> %rdx.minmax.select, <4 x i16> %rdx.shuf1
  %res = extractelement <4 x i16> %rdx.minmax.select3, i32 0
  ret i16 %res
}

; maxnum reduction of <4 x half> through the llvm.maxnum intrinsic. The checks
; expect each input register to be max'ed with itself (the CANON* captures)
; before the actual reduction.
; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1


; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax3 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
  %res = extractelement <4 x half> %rdx.minmax3, i32 0
  ret half %res
}

; minnum reduction of <4 x half> through the llvm.minnum intrinsic. The inputs
; are still expected to be max'ed with themselves first (CANON* captures);
; only the reduction steps use min.
; GCN-LABEL: {{^}}reduction_minnum_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_min_f16_e32 [[MIN0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_min_f16_e32 [[MIN1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_min_f16_e32 v0, [[MIN1]], [[MIN0]]
define half @reduction_minnum_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax = call <4 x half> @llvm.minnum.v4f16(<4 x half> %vec4, <4 x half> %rdx.shuf)
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax3 = call <4 x half> @llvm.minnum.v4f16(<4 x half> %rdx.minmax, <4 x half> %rdx.shuf1)
  %res = extractelement <4 x half> %rdx.minmax3, i32 0
  ret half %res
}

; FIXME: Need to preserve fast math flags when fmaxnum matched
; directly from the IR to avoid unnecessary quieting.
; The X-prefixed check lines below are disabled; presumably they record the
; output wanted once the FIXME is resolved.

; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
; XGFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; XVI: s_waitcnt
; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; XVI-NEXT: v_max_f16_e32 v0, v0, v1
; XVI-NEXT: v_max_f16_e32 v0, v0, v2
; XVI-NEXT: s_setpc_b64

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = fcmp nnan nsz ogt <4 x half> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = fcmp nnan nsz ogt <4 x half> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
  %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
  ret half %res
}

; FIXME: Need to preserve fast math flags when fminnum matched
; directly from the IR to avoid unnecessary quieting.
; The X-prefixed check lines below are disabled; presumably they record the
; output wanted once the FIXME is resolved.

; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
; XGFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; XVI: s_waitcnt
; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; XVI-NEXT: v_min_f16_e32 v0, v0, v1
; XVI-NEXT: v_min_f16_e32 v0, v0, v2
; XVI-NEXT: s_setpc_b64

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}
; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_min_f16_e32 [[MIN0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_min_f16_e32 [[MIN1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_min_f16_e32 v0, [[MIN1]], [[MIN0]]
define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
entry:
  %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %rdx.minmax.cmp = fcmp nnan nsz olt <4 x half> %vec4, %rdx.shuf
  %rdx.minmax.select = select <4 x i1> %rdx.minmax.cmp, <4 x half> %vec4, <4 x half> %rdx.shuf
  %rdx.shuf1 = shufflevector <4 x half> %rdx.minmax.select, <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %rdx.minmax.cmp2 = fcmp nnan nsz olt <4 x half> %rdx.minmax.select, %rdx.shuf1
  %rdx.minmax.select3 = select <4 x i1> %rdx.minmax.cmp2, <4 x half> %rdx.minmax.select, <4 x half> %rdx.shuf1
  %res = extractelement <4 x half> %rdx.minmax.select3, i32 0
  ret half %res
}

; Intrinsics used by the maxnum/minnum reductions above.
declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)