; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s
;
; These functions extract scalars from vector arguments, operate on them lane
; by lane, and rebuild a vector with an insertelement chain. The assertions
; verify which of those chains the SLP vectorizer turns back into vector
; operations. The very low -slp-threshold forces vectorization regardless of
; the cost model's verdict.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"

target triple = "x86_64-apple-macosx10.8.0"

; Baseline case: every lane of %c is compared against zero and the result
; picks between the matching lanes of %a and %b. The four scalar results are
; reassembled in lane order, and the expected output is one vector compare
; feeding one vector select.
define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select(
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
; CHECK-NEXT: select <4 x i1> [[CMP]], <4 x float> %a, <4 x float> %b
  ; Scalarize the condition source and both select operands.
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  ; Per-lane condition: lane value is non-zero.
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %cmp2 = icmp ne i32 %c2, 0
  %cmp3 = icmp ne i32 %c3, 0
  ; Per-lane select between the corresponding lanes of %a and %b.
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %s2 = select i1 %cmp2, float %a2, float %b2
  %s3 = select i1 %cmp3, float %a3, float %b3
  ; Rebuild the result vector, lane i receiving %s<i>.
  %ra = insertelement <4 x float> undef, float %s0, i32 0
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  %rc = insertelement <4 x float> %rb, float %s2, i32 2
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ret <4 x float> %rd
}

; Insert in an order different from the vector indices to make sure it
; doesn't matter. Here %s0 is written to lane 2 and %s2 to lane 0, while
; %s1 and %s3 keep their own lanes.
define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_insert_out_of_order(
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
; CHECK-NEXT: select <4 x i1> [[CMP]], <4 x float> %a, <4 x float> %b
  ; Scalarize the condition source and both select operands.
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  ; Per-lane condition: lane value is non-zero.
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %cmp2 = icmp ne i32 %c2, 0
  %cmp3 = icmp ne i32 %c3, 0
  ; Per-lane select between the corresponding lanes of %a and %b.
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %s2 = select i1 %cmp2, float %a2, float %b2
  %s3 = select i1 %cmp3, float %a3, float %b3
  ; Insertion indices are 2, 1, 0, 3 rather than 0, 1, 2, 3.
  %ra = insertelement <4 x float> undef, float %s0, i32 2
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  %rc = insertelement <4 x float> %rb, float %s2, i32 0
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ret <4 x float> %rd
}

; External functions with no body in this file, taking a vector and a scalar
; float respectively. @v4f32_user is called from @simple_select_users.
declare void @v4f32_user(<4 x float>) #0
declare void @f32_user(float) #0

; Multiple users of the final constructed vector: %rd is both passed to
; @v4f32_user and returned. The vector compare and select are still expected.
define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_users(
; CHECK-NEXT: [[CMP:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
; CHECK-NEXT: select <4 x i1> [[CMP]], <4 x float> %a, <4 x float> %b
  ; Scalarize the condition source and both select operands.
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  ; Per-lane condition: lane value is non-zero.
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %cmp2 = icmp ne i32 %c2, 0
  %cmp3 = icmp ne i32 %c3, 0
  ; Per-lane select between the corresponding lanes of %a and %b.
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %s2 = select i1 %cmp2, float %a2, float %b2
  %s3 = select i1 %cmp3, float %a3, float %b3
  ; Rebuild the result vector, lane i receiving %s<i>.
  %ra = insertelement <4 x float> undef, float %s0, i32 0
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  %rc = insertelement <4 x float> %rb, float %s2, i32 2
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ; Second use of the built vector, in addition to the return below.
  call void @v4f32_user(<4 x float> %rd) #0
  ret <4 x float> %rd
}

; Unused insertelement: %rc starts again from undef instead of continuing
; from %rb, so the %ra/%rb half of the chain has no users and the returned
; vector only has lanes 2 and 3 populated. The assertions require that no
; 4-wide vector compare or select is produced for this function.
define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_no_users(
; CHECK-NOT: icmp ne <4 x i32>
; CHECK-NOT: select <4 x i1>
  ; Scalarize the condition source and both select operands.
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %a2 = extractelement <4 x float> %a, i32 2
  %a3 = extractelement <4 x float> %a, i32 3
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  %b2 = extractelement <4 x float> %b, i32 2
  %b3 = extractelement <4 x float> %b, i32 3
  ; Per-lane condition: lane value is non-zero.
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %cmp2 = icmp ne i32 %c2, 0
  %cmp3 = icmp ne i32 %c3, 0
  ; Per-lane select between the corresponding lanes of %a and %b.
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  %s2 = select i1 %cmp2, float %a2, float %b2
  %s3 = select i1 %cmp3, float %a3, float %b3
  ; First chain (lanes 0 and 1); its result %rb is never used.
  %ra = insertelement <4 x float> undef, float %s0, i32 0
  %rb = insertelement <4 x float> %ra, float %s1, i32 1
  ; Second chain deliberately restarts from undef rather than from %rb.
  %rc = insertelement <4 x float> undef, float %s2, i32 2
  %rd = insertelement <4 x float> %rc, float %s3, i32 3
  ret <4 x float> %rd
}

; Make sure we do not hit the infinite loop that was encountered when trying
; to do this backwards. The function only takes %c apart and puts it back
; together lane for lane; the test passes as long as opt terminates and
; still emits the function.
define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
; CHECK-LABEL: @reconstruct(
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %c2 = extractelement <4 x i32> %c, i32 2
  %c3 = extractelement <4 x i32> %c, i32 3
  ; Each extracted lane goes straight back into the same lane index.
  %ra = insertelement <4 x i32> undef, i32 %c0, i32 0
  %rb = insertelement <4 x i32> %ra, i32 %c1, i32 1
  %rc = insertelement <4 x i32> %rb, i32 %c2, i32 2
  %rd = insertelement <4 x i32> %rc, i32 %c3, i32 3
  ret <4 x i32> %rd
}

; Two-lane variant of @simple_select: the same compare-against-zero and
; select pattern on <2 x ...> operands. A 2-wide vector compare and a 2-wide
; vector select are expected somewhere in the function body.
define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_v2(
; CHECK: icmp ne <2 x i32>
; CHECK: select <2 x i1>
  ; Scalarize the condition source and both select operands.
  %c0 = extractelement <2 x i32> %c, i32 0
  %c1 = extractelement <2 x i32> %c, i32 1
  %a0 = extractelement <2 x float> %a, i32 0
  %a1 = extractelement <2 x float> %a, i32 1
  %b0 = extractelement <2 x float> %b, i32 0
  %b1 = extractelement <2 x float> %b, i32 1
  ; Per-lane condition and select.
  %cmp0 = icmp ne i32 %c0, 0
  %cmp1 = icmp ne i32 %c1, 0
  %s0 = select i1 %cmp0, float %a0, float %b0
  %s1 = select i1 %cmp1, float %a1, float %b1
  ; Rebuild the two-lane result in lane order.
  %ra = insertelement <2 x float> undef, float %s0, i32 0
  %rb = insertelement <2 x float> %ra, float %s1, i32 1
  ret <2 x float> %rb
}

; Make sure when we construct partial vectors, we don't keep
; re-visiting the insertelement chains starting with undef
; (low cost threshold needed to force this to happen).
;
; The input already contains 2-wide vector operations built from lanes 0 and
; 1 of the arguments, whose results are extracted again and inserted into a
; 4-wide vector with only lanes 0 and 1 populated. Only the function label is
; matched: the test passes as long as opt terminates and emits the function.
define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_partial_vector(
  %c0 = extractelement <4 x i32> %c, i32 0
  %c1 = extractelement <4 x i32> %c, i32 1
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1
  %b0 = extractelement <4 x float> %b, i32 0
  %b1 = extractelement <4 x float> %b, i32 1
  ; 2-wide condition vector built from lanes 0 and 1 of %c.
  %1 = insertelement <2 x i32> undef, i32 %c0, i32 0
  %2 = insertelement <2 x i32> %1, i32 %c1, i32 1
  %3 = icmp ne <2 x i32> %2, zeroinitializer
  ; 2-wide select operands built from lanes 0 and 1 of %a and %b.
  %4 = insertelement <2 x float> undef, float %a0, i32 0
  %5 = insertelement <2 x float> %4, float %a1, i32 1
  %6 = insertelement <2 x float> undef, float %b0, i32 0
  %7 = insertelement <2 x float> %6, float %b1, i32 1
  %8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7
  ; Move both result lanes into a 4-wide vector; lanes 2 and 3 stay undef.
  %9 = extractelement <2 x float> %8, i32 0
  %ra = insertelement <4 x float> undef, float %9, i32 0
  %10 = extractelement <2 x float> %8, i32 1
  %rb = insertelement <4 x float> %ra, float %10, i32 1
  ret <4 x float> %rb
}

; Attribute group referenced as #0 by the definitions, declarations and the
; call site in this file.
attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }