; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -S -slp-threshold=-100 -slp-vectorize-hor-store -dce | FileCheck %s --check-prefix=GFX9

; SLP vectorizer tests for 6-element min/max reduction chains where the
; vectorization seed is a store (-slp-vectorize-hor-store): each function
; stores a select that reuses the first compare of the reduction chain.

@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@arr64 = local_unnamed_addr global [32 x i64] zeroinitializer, align 16
@var = global i32 zeroinitializer, align 8
@var64 = global i64 zeroinitializer, align 8

@farr = local_unnamed_addr global [32 x float] zeroinitializer, align 16
@fvar = global float zeroinitializer, align 8

@darr = local_unnamed_addr global [32 x double] zeroinitializer, align 16
@dvar = global double zeroinitializer, align 8

; Tests whether the min/max reduction pattern is vectorized if SLP starts at the store.
; Signed i32 max over @arr[0..5]. Because %cmp1 also feeds the stored select,
; the checks expect elements 0-1 to stay scalar (extracted from a <2 x i32>
; load) while elements 2-5 are reduced via llvm.vector.reduce.smax.v4i32.
define i32 @smaxv6() {
; GFX9-LABEL: @smaxv6(
; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; GFX9-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; GFX9-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
; GFX9-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]]
; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT:    store i32 [[STORE_SELECT]], i32* @var, align 8
; GFX9-NEXT:    ret i32 [[OP_EXTRA1]]
;
  %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
  %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
  %cmp1 = icmp sgt i32 %load1, %load2
  %select1 = select i1 %cmp1, i32 %load1, i32 %load2

  %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
  %cmp2 = icmp sgt i32 %select1, %load3
  %select2 = select i1 %cmp2, i32 %select1, i32 %load3

  %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
  %cmp3 = icmp sgt i32 %select2, %load4
  %select3 = select i1 %cmp3, i32 %select2, i32 %load4

  %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
  %cmp4 = icmp sgt i32 %select3, %load5
  %select4 = select i1 %cmp4, i32 %select3, i32 %load5

  %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
  %cmp5 = icmp sgt i32 %select4, %load6
  %select5 = select i1 %cmp5, i32 %select4, i32 %load6

  ; The stored select reuses %cmp1 from the head of the reduction chain.
  %store-select = select i1 %cmp1, i32 3, i32 4
  store i32 %store-select, i32* @var, align 8
  ret i32 %select5
}

; Same pattern with signed i64 min over @arr64[0..5]; checks expect a
; llvm.vector.reduce.smin.v4i64 for elements 2-5.
define i64 @sminv6() {
; GFX9-LABEL: @sminv6(
; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([32 x i64]* @arr64 to <2 x i64>*), align 16
; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
; GFX9-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
; GFX9-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]])
; GFX9-NEXT:    [[OP_EXTRA:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
; GFX9-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]]
; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
; GFX9-NEXT:    store i64 [[STORE_SELECT]], i64* @var64, align 8
; GFX9-NEXT:    ret i64 [[OP_EXTRA1]]
;
  %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
  %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
  %cmp1 = icmp slt i64 %load1, %load2
  %select1 = select i1 %cmp1, i64 %load1, i64 %load2

  %load3 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2), align 16
  %cmp2 = icmp slt i64 %select1, %load3
  %select2 = select i1 %cmp2, i64 %select1, i64 %load3

  %load4 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 3), align 8
  %cmp3 = icmp slt i64 %select2, %load4
  %select3 = select i1 %cmp3, i64 %select2, i64 %load4

  %load5 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 4), align 16
  %cmp4 = icmp slt i64 %select3, %load5
  %select4 = select i1 %cmp4, i64 %select3, i64 %load5

  %load6 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 5), align 8
  %cmp5 = icmp slt i64 %select4, %load6
  %select5 = select i1 %cmp5, i64 %select4, i64 %load6

  ; The stored select reuses %cmp1 from the head of the reduction chain.
  %store-select = select i1 %cmp1, i64 3, i64 4
  store i64 %store-select, i64* @var64, align 8
  ret i64 %select5
}

; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
; with fastmath on the select.
; Fast-math float max over @farr[0..5]. Per the checks, only the first two
; loads are combined into a <2 x float> load; the rest of the chain stays
; scalar (see FIXME above).
define float @fmaxv6() {
; GFX9-LABEL: @fmaxv6(
; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16
; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
; GFX9-NEXT:    [[LOAD3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
; GFX9-NEXT:    [[CMP2:%.*]] = fcmp fast ogt float [[SELECT1]], [[LOAD3]]
; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], float [[SELECT1]], float [[LOAD3]]
; GFX9-NEXT:    [[LOAD4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
; GFX9-NEXT:    [[CMP3:%.*]] = fcmp fast ogt float [[SELECT2]], [[LOAD4]]
; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], float [[SELECT2]], float [[LOAD4]]
; GFX9-NEXT:    [[LOAD5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
; GFX9-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[SELECT3]], [[LOAD5]]
; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], float [[SELECT3]], float [[LOAD5]]
; GFX9-NEXT:    [[LOAD6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
; GFX9-NEXT:    [[CMP5:%.*]] = fcmp fast ogt float [[SELECT4]], [[LOAD6]]
; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], float [[SELECT4]], float [[LOAD6]]
; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
; GFX9-NEXT:    store float [[STORE_SELECT]], float* @fvar, align 8
; GFX9-NEXT:    ret float [[SELECT5]]
;
  %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16
  %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4
  %cmp1 = fcmp fast ogt float %load1, %load2
  %select1 = select i1 %cmp1, float %load1, float %load2

  %load3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
  %cmp2 = fcmp fast ogt float %select1, %load3
  %select2 = select i1 %cmp2, float %select1, float %load3

  %load4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
  %cmp3 = fcmp fast ogt float %select2, %load4
  %select3 = select i1 %cmp3, float %select2, float %load4

  %load5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
  %cmp4 = fcmp fast ogt float %select3, %load5
  %select4 = select i1 %cmp4, float %select3, float %load5

  %load6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
  %cmp5 = fcmp fast ogt float %select4, %load6
  %select5 = select i1 %cmp5, float %select4, float %load6

  ; The stored select reuses %cmp1 from the head of the reduction chain.
  %store-select = select i1 %cmp1, float 3.0, float 4.0
  store float %store-select, float* @fvar, align 8
  ret float %select5
}

; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
; with fastmath on the select.
; Fast-math double min over @darr[0..5]; like @fmaxv6, the checks expect the
; chain to stay scalar apart from the initial <2 x double> load.
define double @dminv6() {
; GFX9-LABEL: @dminv6(
; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16
; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
; GFX9-NEXT:    [[LOAD3:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
; GFX9-NEXT:    [[CMP2:%.*]] = fcmp fast olt double [[SELECT1]], [[LOAD3]]
; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], double [[SELECT1]], double [[LOAD3]]
; GFX9-NEXT:    [[LOAD4:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
; GFX9-NEXT:    [[CMP3:%.*]] = fcmp fast olt double [[SELECT2]], [[LOAD4]]
; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], double [[SELECT2]], double [[LOAD4]]
; GFX9-NEXT:    [[LOAD5:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
; GFX9-NEXT:    [[CMP4:%.*]] = fcmp fast olt double [[SELECT3]], [[LOAD5]]
; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], double [[SELECT3]], double [[LOAD5]]
; GFX9-NEXT:    [[LOAD6:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
; GFX9-NEXT:    [[CMP5:%.*]] = fcmp fast olt double [[SELECT4]], [[LOAD6]]
; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], double [[SELECT4]], double [[LOAD6]]
; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
; GFX9-NEXT:    store double [[STORE_SELECT]], double* @dvar, align 8
; GFX9-NEXT:    ret double [[SELECT5]]
;
  %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16
  %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4
  %cmp1 = fcmp fast olt double %load1, %load2
  %select1 = select i1 %cmp1, double %load1, double %load2

  %load3 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
  %cmp2 = fcmp fast olt double %select1, %load3
  %select2 = select i1 %cmp2, double %select1, double %load3

  %load4 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
  %cmp3 = fcmp fast olt double %select2, %load4
  %select3 = select i1 %cmp3, double %select2, double %load4

  %load5 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
  %cmp4 = fcmp fast olt double %select3, %load5
  %select4 = select i1 %cmp4, double %select3, double %load5

  %load6 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
  %cmp5 = fcmp fast olt double %select4, %load6
  %select5 = select i1 %cmp5, double %select4, double %load6

  ; The stored select reuses %cmp1 from the head of the reduction chain.
  %store-select = select i1 %cmp1, double 3.0, double 4.0
  store double %store-select, double* @dvar, align 8
  ret double %select5
}

; Variant where the chain head is fed by two separate extracts of lane 0 of a
; pre-vectorized <2 x i32> load (plus the argument %v1), so the head operands
; carry different value numbers; checks expect elements 2-5 to still be
; reduced via llvm.vector.reduce.smax.v4i32.
define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
; GFX9-LABEL: @smax_wdiff_valuenum(
; GFX9-NEXT:    [[VLOAD:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
; GFX9-NEXT:    [[ELT1:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
; GFX9-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]]
; GFX9-NEXT:    [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
; GFX9-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; GFX9-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
; GFX9-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]]
; GFX9-NEXT:    [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT:    store i32 [[STOREVAL]], i32* @var, align 8
; GFX9-NEXT:    ret i32 [[OP_EXTRA1]]
;
  %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
  %elt1 = extractelement <2 x i32> %vload, i32 0
  %cmp1 = icmp sgt i32 %elt1, %v1
  %ex0 = extractelement <2 x i32> %vload, i32 0
  %select1 = select i1 %cmp1, i32 %ex0, i32 %v1

  %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
  %cmp2 = icmp sgt i32 %select1, %load3
  %select2 = select i1 %cmp2, i32 %select1, i32 %load3

  %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
  %cmp3 = icmp sgt i32 %select2, %load4
  %select3 = select i1 %cmp3, i32 %select2, i32 %load4

  %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
  %cmp4 = icmp sgt i32 %select3, %load5
  %select4 = select i1 %cmp4, i32 %select3, i32 %load5

  %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
  %cmp5 = icmp sgt i32 %select4, %load6
  %select5 = select i1 %cmp5, i32 %select4, i32 %load6

  ; The stored select reuses %cmp1 from the head of the reduction chain.
  %storeval = select i1 %cmp1, i32 3, i32 4
  store i32 %storeval, i32* @var, align 8
  ret i32 %select5
}