; InstCombine tests for simplification of x86 vector/SSE intrinsics.
; RUN: opt < %s -instcombine -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Scalar-only use of the ss intrinsic chain: the zero upper lanes are dead,
; so the whole chain should fold down to scalar fp math (a single fmul etc.).
define i16 @test1(float %f) {
entry:
; CHECK-LABEL: @test1(
; CHECK: fmul float
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK-NOT: call {{.*}} @llvm.x86.sse.mul
; CHECK-NOT: call {{.*}} @llvm.x86.sse.sub
; CHECK: ret
	%tmp = insertelement <4 x float> undef, float %f, i32 0		; <<4 x float>> [#uses=1]
	%tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
	%tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
	%tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
	%tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )		; <<4 x float>> [#uses=1]
	%tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )		; <<4 x float>> [#uses=1]
	%tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )		; <<4 x float>> [#uses=1]
	%tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )		; <<4 x float>> [#uses=1]
	%tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )		; <i32> [#uses=1]
	%tmp69 = trunc i32 %tmp.upgrd.1 to i16		; <i16> [#uses=1]
	ret i16 %tmp69
}

; Insert/bitcast/extract of lane 0 only: the vector round-trip is dead and
; should disappear entirely.
define i32 @test2(float %f) {
; CHECK-LABEL: @test2(
; CHECK-NOT: insertelement
; CHECK-NOT: extractelement
; CHECK: ret
  %tmp5 = fmul float %f, %f
  %tmp9 = insertelement <4 x float> undef, float %tmp5, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp19 = bitcast <4 x float> %tmp12 to <4 x i32>
  %tmp21 = extractelement <4 x i32> %tmp19, i32 0
  ret i32 %tmp21
}

; The cvt*2si family only reads lane 0, so the zero-fill insertelements
; feeding each call are dead and should be removed.
define i64 @test3(float %f, double %d) {
; CHECK-LABEL: @test3(
; CHECK-NOT: insertelement {{.*}} 0.00
; CHECK: ret
entry:
  %v00 = insertelement <4 x float> undef, float %f, i32 0
  %v01 = insertelement <4 x float> %v00, float 0.000000e+00, i32 1
  %v02 = insertelement <4 x float> %v01, float 0.000000e+00, i32 2
  %v03 = insertelement <4 x float> %v02, float 0.000000e+00, i32 3
  %tmp0 = tail call i32 @llvm.x86.sse.cvtss2si(<4 x float> %v03)
  %v10 = insertelement <4 x float> undef, float %f, i32 0
  %v11 = insertelement <4 x float> %v10, float 0.000000e+00, i32 1
  %v12 = insertelement <4 x float> %v11, float 0.000000e+00, i32 2
  %v13 = insertelement <4 x float> %v12, float 0.000000e+00, i32 3
  %tmp1 = tail call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %v13)
  %v20 = insertelement <4 x float> undef, float %f, i32 0
  %v21 = insertelement <4 x float> %v20, float 0.000000e+00, i32 1
  %v22 = insertelement <4 x float> %v21, float 0.000000e+00, i32 2
  %v23 = insertelement <4 x float> %v22, float 0.000000e+00, i32 3
  %tmp2 = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %v23)
  %v30 = insertelement <4 x float> undef, float %f, i32 0
  %v31 = insertelement <4 x float> %v30, float 0.000000e+00, i32 1
  %v32 = insertelement <4 x float> %v31, float 0.000000e+00, i32 2
  %v33 = insertelement <4 x float> %v32, float 0.000000e+00, i32 3
  %tmp3 = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %v33)
  %v40 = insertelement <2 x double> undef, double %d, i32 0
  %v41 = insertelement <2 x double> %v40, double 0.000000e+00, i32 1
  %tmp4 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %v41)
  %v50 = insertelement <2 x double> undef, double %d, i32 0
  %v51 = insertelement <2 x double> %v50, double 0.000000e+00, i32 1
  %tmp5 = tail call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %v51)
  %v60 = insertelement <2 x double> undef, double %d, i32 0
  %v61 = insertelement <2 x double> %v60, double 0.000000e+00, i32 1
  %tmp6 = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %v61)
  %v70 = insertelement <2 x double> undef, double %d, i32 0
  %v71 = insertelement <2 x double> %v70, double 0.000000e+00, i32 1
  %tmp7 = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %v71)
  %tmp8 = add i32 %tmp0, %tmp2
  %tmp9 = add i32 %tmp4, %tmp6
  %tmp10 = add i32 %tmp8, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = add i64 %tmp1, %tmp3
  %tmp13 = add i64 %tmp5, %tmp7
  %tmp14 = add i64 %tmp12, %tmp13
  %tmp15 = add i64 %tmp11, %tmp14
  ret i64 %tmp15
}

; Extracting lane 0 of an insert into lane 1 of a zero vector is a known
; constant (0), so the extractelement should fold away.
define void @get_image() nounwind {
; CHECK-LABEL: @get_image(
; CHECK-NOT: extractelement
; CHECK: unreachable
entry:
  %0 = call i32 @fgetc(i8* null) nounwind               ; <i32> [#uses=1]
  %1 = trunc i32 %0 to i8         ; <i8> [#uses=1]
  %tmp2 = insertelement <100 x i8> zeroinitializer, i8 %1, i32 1          ; <<100 x i8>> [#uses=1]
  %tmp1 = extractelement <100 x i8> %tmp2, i32 0          ; <i8> [#uses=1]
  %2 = icmp eq i8 %tmp1, 80               ; <i1> [#uses=1]
  br i1 %2, label %bb2, label %bb3

bb2:            ; preds = %entry
  br label %bb3

bb3:            ; preds = %bb2, %entry
  unreachable
}

; PR4340
; All four lanes of the load are overwritten before the store, so the load
; is dead and the store becomes a store of constant zero.
define void @vac(<4 x float>* nocapture %a) nounwind {
; CHECK-LABEL: @vac(
; CHECK-NOT: load
; CHECK: ret
entry:
	%tmp1 = load <4 x float>* %a		; <<4 x float>> [#uses=1]
	%vecins = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 0	; <<4 x float>> [#uses=1]
	%vecins4 = insertelement <4 x float> %vecins, float 0.000000e+00, i32 1; <<4 x float>> [#uses=1]
	%vecins6 = insertelement <4 x float> %vecins4, float 0.000000e+00, i32 2; <<4 x float>> [#uses=1]
	%vecins8 = insertelement <4 x float> %vecins6, float 0.000000e+00, i32 3; <<4 x float>> [#uses=1]
	store <4 x float> %vecins8, <4 x float>* %a
	ret void
}

declare i32 @fgetc(i8*)

declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)

declare i32 @llvm.x86.sse.cvtss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>)
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>)
declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>)
declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>)
declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>)

; <rdar://problem/6945110>
define <4 x i32> @kernel3_vertical(<4 x i16> * %src, <8 x i16> * %foo) nounwind {
entry:
	%tmp = load <4 x i16>* %src
	%tmp1 = load <8 x i16>* %foo
; CHECK: %tmp2 = shufflevector
	%tmp2 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; pmovzxwd ignores the upper 64-bits of its input; -instcombine should remove this shuffle:
; CHECK-NOT: shufflevector
	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: pmovzxwd
	%0 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp3)
	ret <4 x i32> %0
}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone

; Lanes 2 and 3 of the widened %y are never selected by the second shuffle,
; so the first shuffle should use undef for them.
define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
entry:
; CHECK-LABEL: define <4 x float> @dead_shuffle_elt(
; CHECK: shufflevector <2 x float> %y, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle.i = shufflevector <2 x float> %y, <2 x float> %y, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  %shuffle9.i = shufflevector <4 x float> %x, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle9.i
}

; Only the low two lanes survive the final shuffle, so the inserts for
; lanes 2 and 3 are dead: exactly two insertelements should remain.
define <2 x float> @test_fptrunc(double %f) {
; CHECK-LABEL: @test_fptrunc(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x double> undef, double %f, i32 0
  %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x double> %tmp10, double 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x double> %tmp11, double 0.000000e+00, i32 3
  %tmp5 = fptrunc <4 x double> %tmp12 to <4 x float>
  %ret = shufflevector <4 x float> %tmp5, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x float> %ret
}

; Same as test_fptrunc but through fpext: dead high-lane inserts vanish.
define <2 x double> @test_fpext(float %f) {
; CHECK-LABEL: @test_fpext(
; CHECK: insertelement
; CHECK: insertelement
; CHECK-NOT: insertelement
  %tmp9 = insertelement <4 x float> undef, float %f, i32 0
  %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3
  %tmp5 = fpext <4 x float> %tmp12 to <4 x double>
  %ret = shufflevector <4 x double> %tmp5, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %ret
}

; Lanes of each operand that the constant select mask never chooses are
; dead; their inserts fold into constants/undef in the select operands.
define <4 x float> @test_select(float %f, float %g) {
; CHECK-LABEL: @test_select(
; CHECK: %a0 = insertelement <4 x float> undef, float %f, i32 0
; CHECK-NOT: insertelement
; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
; CHECK-NOT: insertelement
; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
  %a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
  %a3 = insertelement <4 x float> %a2, float 3.000000e+00, i32 3
  %b0 = insertelement <4 x float> undef, float %g, i32 0
  %b1 = insertelement <4 x float> %b0, float 4.000000e+00, i32 1
  %b2 = insertelement <4 x float> %b1, float 5.000000e+00, i32 2
  %b3 = insertelement <4 x float> %b2, float 6.000000e+00, i32 3
  %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> %b3
  ret <4 x float> %ret
}

; ---- SSE4a insertqi combining tests ----
; Each test feeds two insertqi calls (length/index immediates) through
; instcombine and checks whether they merge, simplify, or stay put.

; We should optimize these two redundant insertqi into one
; CHECK: define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertTwice(<2 x i64> %v, <2 x i64> %i) {
; CHECK: call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
; CHECK-NOT: insertqi
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 32)
  ret <2 x i64> %2
}

; The result of this insert is the second arg, since the top 64 bits of
; the result are undefined, and we copy the bottom 64 bits from the
; second arg
; CHECK: define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsert64Bits(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 0)
  ret <2 x i64> %1
}

; Test the several types of ranges and ordering that exist for two insertqi
; CHECK: define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertContainedRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 16)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertOverlappingRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 16)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 32, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertAdjacentRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: %[[RES:.*]] = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 0)
; CHECK: ret <2 x i64> %[[RES]]
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 32)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 32, i8 0)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange(<2 x i64> %v, <2 x i64> %i) {
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK: tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testInsertDisjointRange_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK:  tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
; CHECK:  tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 16, i8 0)
  %2 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %1, <2 x i64> %i, i8 16, i8 32)
  ret <2 x i64> %2
}

; CHECK: define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testZeroLength(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> %i
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 0)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_1(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 0, i8 16)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_2(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 48, i8 32)
  ret <2 x i64> %1
}

; CHECK: define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i)
define <2 x i64> @testUndefinedInsertq_3(<2 x i64> %v, <2 x i64> %i) {
; CHECK: ret <2 x i64> undef
  %1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %v, <2 x i64> %i, i8 64, i8 16)
  ret <2 x i64> %1
}

; CHECK: declare <2 x i64> @llvm.x86.sse4a.insertqi
declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwind

; ---- AVX vpermilvar tests ----
; A vpermilvar call with a constant selector should become a shufflevector.

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %a
}

declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %a
}

declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i32>)
define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i32> <i32 2, i32 0>)
  ret <2 x double> %a
}

declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i32>)
define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> <i32 3, i32 1, i32 2, i32 0>)
  ret <4 x double> %a
}

define <4 x float> @test_vpermilvar_ps_zero(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_zero(
; CHECK: shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
  ret <4 x float> %a
}

define <8 x float> @test_vpermilvar_ps_256_zero(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256_zero(
; CHECK: shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
  ret <8 x float> %a
}

define <2 x double> @test_vpermilvar_pd_zero(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_zero(
; CHECK: shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i32> zeroinitializer)
  ret <2 x double> %a
}

define <4 x double> @test_vpermilvar_pd_256_zero(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256_zero(
; CHECK: shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i32> zeroinitializer)
  ret <4 x double> %a
}

; ---- Constant folding of SSE2/AVX2 left-shift intrinsics ----
; The whole chain is built from constants, so it should fold to a constant
; vector (or zero when the shift amount >= element width).

define <2 x i64> @test_sse2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_1
; CHECK: ret <2 x i64> <i64 72058418680037440, i64 144117112246370624>
}

define <4 x i64> @test_avx2_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_1
; CHECK: ret <4 x i64> <i64 64, i64 128, i64 192, i64 256>
}

define <2 x i64> @test_sse2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> <i16 1, i16 0, i16 0, i16 0, i16 2, i16 0, i16 0, i16 0, i16 3, i16 0, i16 0, i16 0, i16 4, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_0
; CHECK: ret <4 x i64> zeroinitializer
}
; ---- Constant folding of SSE2/AVX2 logical right-shift intrinsics ----
define <2 x i64> @test_sse2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_1
; CHECK: ret <2 x i64> <i64 562954248421376, i64 9007267974742020>
}

define <4 x i64> @test_avx2_psrl_1() nounwind readnone uwtable {
  %S = bitcast i32 1 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_1
; CHECK: ret <4 x i64> <i64 16, i64 32, i64 64, i64 128>
}

define <2 x i64> @test_sse2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> <i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096>, <8 x i16> %4)
  %6 = bitcast <8 x i16> %5 to <4 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %6, <4 x i32> %7)
  %9 = bitcast <4 x i32> %8 to <2 x i64>
  %10 = tail call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %9, <2 x i64> %3)
  %11 = bitcast <2 x i64> %10 to <8 x i16>
  %12 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %11, i32 %S)
  %13 = bitcast <8 x i16> %12 to <4 x i32>
  %14 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %13, i32 %S)
  %15 = bitcast <4 x i32> %14 to <2 x i64>
  %16 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %15, i32 %S)
  ret <2 x i64> %16
; CHECK: test_sse2_psrl_0
; CHECK: ret <2 x i64> zeroinitializer
}

define <4 x i64> @test_avx2_psrl_0() nounwind readnone uwtable {
  %S = bitcast i32 128 to i32
  %1 = zext i32 %S to i64
  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %3 = insertelement <2 x i64> %2, i64 0, i32 1
  %4 = bitcast <2 x i64> %3 to <8 x i16>
  %5 = tail call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> <i16 1024, i16 0, i16 0, i16 0, i16 2048, i16 0, i16 0, i16 0, i16 4096, i16 0, i16 0, i16 0, i16 8192, i16 0, i16 0, i16 0>, <8 x i16> %4)
  %6 = bitcast <16 x i16> %5 to <8 x i32>
  %7 = bitcast <2 x i64> %3 to <4 x i32>
  %8 = tail call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %6, <4 x i32> %7)
  %9 = bitcast <8 x i32> %8 to <4 x i64>
  %10 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %9, <2 x i64> %3)
  %11 = bitcast <4 x i64> %10 to <16 x i16>
  %12 = tail call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %11, i32 %S)
  %13 = bitcast <16 x i16> %12 to <8 x i32>
  %14 = tail call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %13, i32 %S)
  %15 = bitcast <8 x i32> %14 to <4 x i64>
  %16 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %15, i32 %S)
  ret <4 x i64> %16
; CHECK: test_avx2_psrl_0
; CHECK: ret <4 x i64> zeroinitializer
}

declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) #1
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) #1
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) #1
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) #1
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) #1
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) #1
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) #1
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) #1
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) #1
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) #1
declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) #1
declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) #1
declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) #1

attributes #1 = { nounwind readnone }
