; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -S -slp-threshold=-100 -slp-vectorize-hor-store -dce | FileCheck %s --check-prefix=GFX9

@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@arr64 = local_unnamed_addr global [32 x i64] zeroinitializer, align 16
@var = global i32 zeroinitializer, align 8
@var64 = global i64 zeroinitializer, align 8

@farr = local_unnamed_addr global [32 x float] zeroinitializer, align 16
@fvar = global float zeroinitializer, align 8

@darr = local_unnamed_addr global [32 x double] zeroinitializer, align 16
@dvar = global double zeroinitializer, align 8

; Tests whether the min/max reduction pattern is vectorized if SLP starts at the store.
; i32 smax reduction over arr[0..5]; %cmp1 is reused by the trailing store,
; so SLP is seeded from the store (-slp-vectorize-hor-store).
define i32 @smaxv6() {
; GFX9-LABEL: @smaxv6(
; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; GFX9-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; GFX9-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
; GFX9-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]]
; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT:    store i32 [[STORE_SELECT]], i32* @var, align 8
; GFX9-NEXT:    ret i32 [[OP_EXTRA1]]
;
  %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
  %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
  %cmp1 = icmp sgt i32 %load1, %load2
  %select1 = select i1 %cmp1, i32 %load1, i32 %load2

  %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
  %cmp2 = icmp sgt i32 %select1, %load3
  %select2 = select i1 %cmp2, i32 %select1, i32 %load3

  %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
  %cmp3 = icmp sgt i32 %select2, %load4
  %select3 = select i1 %cmp3, i32 %select2, i32 %load4

  %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
  %cmp4 = icmp sgt i32 %select3, %load5
  %select4 = select i1 %cmp4, i32 %select3, i32 %load5

  %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
  %cmp5 = icmp sgt i32 %select4, %load6
  %select5 = select i1 %cmp5, i32 %select4, i32 %load6

  %store-select = select i1 %cmp1, i32 3, i32 4
  store i32 %store-select, i32* @var, align 8
  ret i32 %select5
}

; i64 smin reduction over arr64[0..5]; same shape as @smaxv6 but signed-min on i64.
define i64 @sminv6() {
; GFX9-LABEL: @sminv6(
; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([32 x i64]* @arr64 to <2 x i64>*), align 16
; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
; GFX9-NEXT:    [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
; GFX9-NEXT:    [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
; GFX9-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]])
; GFX9-NEXT:    [[OP_EXTRA:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
; GFX9-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]]
; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
; GFX9-NEXT:    store i64 [[STORE_SELECT]], i64* @var64, align 8
; GFX9-NEXT:    ret i64 [[OP_EXTRA1]]
;
  %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
  %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
  %cmp1 = icmp slt i64 %load1, %load2
  %select1 = select i1 %cmp1, i64 %load1, i64 %load2

  %load3 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2), align 16
  %cmp2 = icmp slt i64 %select1, %load3
  %select2 = select i1 %cmp2, i64 %select1, i64 %load3

  %load4 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 3), align 8
  %cmp3 = icmp slt i64 %select2, %load4
  %select3 = select i1 %cmp3, i64 %select2, i64 %load4

  %load5 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 4), align 16
  %cmp4 = icmp slt i64 %select3, %load5
  %select4 = select i1 %cmp4, i64 %select3, i64 %load5

  %load6 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 5), align 8
  %cmp5 = icmp slt i64 %select4, %load6
  %select5 = select i1 %cmp5, i64 %select4, i64 %load6

  %store-select = select i1 %cmp1, i64 3, i64 4
  store i64 %store-select, i64* @var64, align 8
  ret i64 %select5
}

; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
; with fastmath on the select.
; float fmax chain over farr[0..5]; the CHECK lines show only the first pair is
; vectorized — the fcmp/select chain itself is not reduced (see FIXME above).
define float @fmaxv6() {
; GFX9-LABEL: @fmaxv6(
; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16
; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
; GFX9-NEXT:    [[LOAD3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
; GFX9-NEXT:    [[CMP2:%.*]] = fcmp fast ogt float [[SELECT1]], [[LOAD3]]
; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], float [[SELECT1]], float [[LOAD3]]
; GFX9-NEXT:    [[LOAD4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
; GFX9-NEXT:    [[CMP3:%.*]] = fcmp fast ogt float [[SELECT2]], [[LOAD4]]
; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], float [[SELECT2]], float [[LOAD4]]
; GFX9-NEXT:    [[LOAD5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
; GFX9-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[SELECT3]], [[LOAD5]]
; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], float [[SELECT3]], float [[LOAD5]]
; GFX9-NEXT:    [[LOAD6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
; GFX9-NEXT:    [[CMP5:%.*]] = fcmp fast ogt float [[SELECT4]], [[LOAD6]]
; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], float [[SELECT4]], float [[LOAD6]]
; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
; GFX9-NEXT:    store float [[STORE_SELECT]], float* @fvar, align 8
; GFX9-NEXT:    ret float [[SELECT5]]
;
  %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16
  %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4
  %cmp1 = fcmp fast ogt float %load1, %load2
  %select1 = select i1 %cmp1, float %load1, float %load2

  %load3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
  %cmp2 = fcmp fast ogt float %select1, %load3
  %select2 = select i1 %cmp2, float %select1, float %load3

  %load4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
  %cmp3 = fcmp fast ogt float %select2, %load4
  %select3 = select i1 %cmp3, float %select2, float %load4

  %load5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
  %cmp4 = fcmp fast ogt float %select3, %load5
  %select4 = select i1 %cmp4, float %select3, float %load5

  %load6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
  %cmp5 = fcmp fast ogt float %select4, %load6
  %select5 = select i1 %cmp5, float %select4, float %load6

  %store-select = select i1 %cmp1, float 3.0, float 4.0
  store float %store-select, float* @fvar, align 8
  ret float %select5
}

; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select
; with fastmath on the select.
; double fmin chain over darr[0..5]; like @fmaxv6, only the first pair is
; vectorized per the CHECK lines — the chain is not turned into a reduction.
define double @dminv6() {
; GFX9-LABEL: @dminv6(
; GFX9-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16
; GFX9-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; GFX9-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; GFX9-NEXT:    [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
; GFX9-NEXT:    [[LOAD3:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
; GFX9-NEXT:    [[CMP2:%.*]] = fcmp fast olt double [[SELECT1]], [[LOAD3]]
; GFX9-NEXT:    [[SELECT2:%.*]] = select i1 [[CMP2]], double [[SELECT1]], double [[LOAD3]]
; GFX9-NEXT:    [[LOAD4:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
; GFX9-NEXT:    [[CMP3:%.*]] = fcmp fast olt double [[SELECT2]], [[LOAD4]]
; GFX9-NEXT:    [[SELECT3:%.*]] = select i1 [[CMP3]], double [[SELECT2]], double [[LOAD4]]
; GFX9-NEXT:    [[LOAD5:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
; GFX9-NEXT:    [[CMP4:%.*]] = fcmp fast olt double [[SELECT3]], [[LOAD5]]
; GFX9-NEXT:    [[SELECT4:%.*]] = select i1 [[CMP4]], double [[SELECT3]], double [[LOAD5]]
; GFX9-NEXT:    [[LOAD6:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
; GFX9-NEXT:    [[CMP5:%.*]] = fcmp fast olt double [[SELECT4]], [[LOAD6]]
; GFX9-NEXT:    [[SELECT5:%.*]] = select i1 [[CMP5]], double [[SELECT4]], double [[LOAD6]]
; GFX9-NEXT:    [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
; GFX9-NEXT:    store double [[STORE_SELECT]], double* @dvar, align 8
; GFX9-NEXT:    ret double [[SELECT5]]
;
  %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16
  %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4
  %cmp1 = fcmp fast olt double %load1, %load2
  %select1 = select i1 %cmp1, double %load1, double %load2

  %load3 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
  %cmp2 = fcmp fast olt double %select1, %load3
  %select2 = select i1 %cmp2, double %select1, double %load3

  %load4 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
  %cmp3 = fcmp fast olt double %select2, %load4
  %select3 = select i1 %cmp3, double %select2, double %load4

  %load5 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
  %cmp4 = fcmp fast olt double %select3, %load5
  %select4 = select i1 %cmp4, double %select3, double %load5

  %load6 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
  %cmp5 = fcmp fast olt double %select4, %load6
  %select5 = select i1 %cmp5, double %select4, double %load6

  %store-select = select i1 %cmp1, double 3.0, double 4.0
  store double %store-select, double* @dvar, align 8
  ret double %select5
}

; Like @smaxv6, but the first compare and its select read the lane through two
; separate extractelements (%elt1 vs %ex0), exercising reduction matching when
; the operands have different value numbers.
define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
; GFX9-LABEL: @smax_wdiff_valuenum(
; GFX9-NEXT:    [[VLOAD:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
; GFX9-NEXT:    [[ELT1:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
; GFX9-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]]
; GFX9-NEXT:    [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
; GFX9-NEXT:    [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
; GFX9-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
; GFX9-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
; GFX9-NEXT:    [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
; GFX9-NEXT:    [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]]
; GFX9-NEXT:    [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
; GFX9-NEXT:    store i32 [[STOREVAL]], i32* @var, align 8
; GFX9-NEXT:    ret i32 [[OP_EXTRA1]]
;
  %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
  %elt1 = extractelement <2 x i32> %vload, i32 0
  %cmp1 = icmp sgt i32 %elt1, %v1
  %ex0 = extractelement <2 x i32> %vload, i32 0
  %select1 = select i1 %cmp1, i32 %ex0, i32 %v1

  %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
  %cmp2 = icmp sgt i32 %select1, %load3
  %select2 = select i1 %cmp2, i32 %select1, i32 %load3

  %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
  %cmp3 = icmp sgt i32 %select2, %load4
  %select3 = select i1 %cmp3, i32 %select2, i32 %load4

  %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
  %cmp4 = icmp sgt i32 %select3, %load5
  %select4 = select i1 %cmp4, i32 %select3, i32 %load5

  %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
  %cmp5 = icmp sgt i32 %select4, %load6
  %select5 = select i1 %cmp5, i32 %select4, i32 %load6

  %storeval = select i1 %cmp1, i32 3, i32 4
  store i32 %storeval, i32* @var, align 8
  ret i32 %select5
}