1; Tests for SSE2 and below, without SSE3+.
2; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
3
; test1: build the vector { %B, A[1] } and store it to %r.
; Shuffle index 2 selects element 0 of the second operand (%tmp7, which holds
; %B in lane 0); index 1 keeps the high element of the vector loaded from %A.
; The expected output overwrites the low half of the loaded value with a
; single movlpd from the stack.
4define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
5; CHECK-LABEL: test1:
6; CHECK:       ## BB#0:
7; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
8; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
9; CHECK-NEXT:    movapd (%ecx), %xmm0
10; CHECK-NEXT:    movlpd {{[0-9]+}}(%esp), %xmm0
11; CHECK-NEXT:    movapd %xmm0, (%eax)
12; CHECK-NEXT:    retl
13	%tmp3 = load <2 x double>* %A, align 16
14	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
15	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
16	store <2 x double> %tmp9, <2 x double>* %r, align 16
17	ret void
18}
19
; test2: build the vector { A[0], %B } and store it to %r.
; Shuffle index 0 keeps the low element of the vector loaded from %A; index 2
; selects element 0 of the second operand (%tmp7, which holds %B in lane 0).
; The expected output overwrites the high half of the loaded value with a
; single movhpd from the stack.
20define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
21; CHECK-LABEL: test2:
22; CHECK:       ## BB#0:
23; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
24; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
25; CHECK-NEXT:    movapd (%ecx), %xmm0
26; CHECK-NEXT:    movhpd {{[0-9]+}}(%esp), %xmm0
27; CHECK-NEXT:    movapd %xmm0, (%eax)
28; CHECK-NEXT:    retl
29	%tmp3 = load <2 x double>* %A, align 16
30	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
31	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
32	store <2 x double> %tmp9, <2 x double>* %r, align 16
33	ret void
34}
35
36
; test3: interleave the two low elements of the vectors loaded from %A and %B,
; producing { A[0], B[0], A[1], B[1] }, written as a chain of scalar
; extractelement/insertelement operations rather than a shufflevector.
; The expected output recognizes the pattern as one unpcklps whose second
; source is a memory operand.
37define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
38; CHECK-LABEL: test3:
39; CHECK:       ## BB#0:
40; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
41; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
42; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
43; CHECK-NEXT:    movaps (%edx), %xmm0
44; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
45; CHECK-NEXT:    movaps %xmm0, (%eax)
46; CHECK-NEXT:    retl
47	%tmp = load <4 x float>* %B		; <<4 x float>> [#uses=2]
48	%tmp3 = load <4 x float>* %A		; <<4 x float>> [#uses=2]
49	%tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0		; <float> [#uses=1]
50	%tmp7 = extractelement <4 x float> %tmp, i32 0		; <float> [#uses=1]
51	%tmp8 = extractelement <4 x float> %tmp3, i32 1		; <float> [#uses=1]
52	%tmp9 = extractelement <4 x float> %tmp, i32 1		; <float> [#uses=1]
53	%tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0		; <<4 x float>> [#uses=1]
54	%tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1		; <<4 x float>> [#uses=1]
55	%tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2		; <<4 x float>> [#uses=1]
56	%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3		; <<4 x float>> [#uses=1]
57	store <4 x float> %tmp13, <4 x float>* %res
58	ret void
59}
60
; test4: shuffle %X against an undef second operand and store the result.
; Mask indices 6 and 7 refer to the undef operand, so only result lanes 0 and
; 2 (X[2] and X[3]) carry defined values; lanes 1 and 3 may be anything.
; The expected output is a single in-register shufps on xmm0.
61define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
62; CHECK-LABEL: test4:
63; CHECK:       ## BB#0:
64; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
65; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
66; CHECK-NEXT:    movaps %xmm0, (%eax)
67; CHECK-NEXT:    retl
68	%tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
69	store <4 x float> %tmp5, <4 x float>* %res
70	ret void
71}
72
; test5: two-stage widening interleave of a scalar loaded through %ptr.
; The pointer stored at %ptr is reinterpreted as float*, one float is loaded
; and placed in lane 0 of a vector whose other lanes are 0.0. The vector is
; then viewed as <16 x i8> and its low 8 bytes are interleaved with zero
; bytes; that result is viewed as <8 x i16> and interleaved with zero words,
; with the zero vector as the FIRST shuffle operand. The final value is
; returned as <4 x i32>.
; The expected output is a movss load, a pxor to materialize zero, then
; punpcklbw followed by punpcklwd.
73define <4 x i32> @test5(i8** %ptr) nounwind {
74; CHECK-LABEL: test5:
75; CHECK:       ## BB#0:
76; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
77; CHECK-NEXT:    movl (%eax), %eax
78; CHECK-NEXT:    movss (%eax), %xmm1
79; CHECK-NEXT:    pxor %xmm0, %xmm0
80; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
81; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
82; CHECK-NEXT:    retl
83	%tmp = load i8** %ptr		; <i8*> [#uses=1]
84	%tmp.upgrd.1 = bitcast i8* %tmp to float*		; <float*> [#uses=1]
85	%tmp.upgrd.2 = load float* %tmp.upgrd.1		; <float> [#uses=1]
86	%tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0		; <<4 x float>> [#uses=1]
87	%tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
88	%tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
89	%tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
90	%tmp21 = bitcast <4 x float> %tmp11 to <16 x i8>		; <<16 x i8>> [#uses=1]
91	%tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 >		; <<16 x i8>> [#uses=1]
92	%tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16>		; <<8 x i16>> [#uses=1]
93	%tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 >		; <<8 x i16>> [#uses=1]
94	%tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32>		; <<4 x i32>> [#uses=1]
95	ret <4 x i32> %tmp36
96}
97
; test6: shuffle the vector loaded from %A against an undef second operand.
; Lane 0 keeps A[0]; mask indices 5, 6 and 7 all select from the undef
; operand, so lanes 1-3 are unconstrained.
; The expected output contains no shuffle instruction at all: just an aligned
; load followed by an aligned store.
98define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
99; CHECK-LABEL: test6:
100; CHECK:       ## BB#0:
101; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
102; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
103; CHECK-NEXT:    movaps (%ecx), %xmm0
104; CHECK-NEXT:    movaps %xmm0, (%eax)
105; CHECK-NEXT:    retl
106  %tmp1 = load <4 x float>* %A            ; <<4 x float>> [#uses=1]
107  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
108  store <4 x float> %tmp2, <4 x float>* %res
109  ret void
110}
111
; test7: splat lane 0 of an all-zero vector (the shuffle mask is itself
; zeroinitializer, i.e. every result lane selects index 0) and store the
; result to the null address.
; %1 and %2 are the implicitly numbered results of the two unnamed
; instructions below.
; The expected output folds everything to an xorps-zeroed register stored to
; absolute address 0.
112define void @test7() nounwind {
113; CHECK-LABEL: test7:
114; CHECK:       ## BB#0:
115; CHECK-NEXT:    xorps %xmm0, %xmm0
116; CHECK-NEXT:    movaps %xmm0, 0
117; CHECK-NEXT:    retl
118  bitcast <4 x i32> zeroinitializer to <4 x float>                ; <<4 x float>>:1 [#uses=1]
119  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
120  store <4 x float> %2, <4 x float>* null
121  ret void
122}
123
; External array of four i32 values read element-by-element by test8.
124@x = external global [4 x i32]
125
; test8: load elements 0..3 of @x with four separate scalar loads, insert
; them in order into a <4 x i32>, and return the vector as <2 x i64>.
; The expected output merges the four consecutive loads into one unaligned
; 128-bit movups through the global's non-lazy pointer.
126define <2 x i64> @test8() nounwind {
127; CHECK-LABEL: test8:
128; CHECK:       ## BB#0:
129; CHECK-NEXT:    movl L_x$non_lazy_ptr, %eax
130; CHECK-NEXT:    movups (%eax), %xmm0
131; CHECK-NEXT:    retl
132	%tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0)		; <i32> [#uses=1]
133	%tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1)		; <i32> [#uses=1]
134	%tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2)		; <i32> [#uses=1]
135	%tmp7 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 3)		; <i32> [#uses=1]
136	%tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0		; <<4 x i32>> [#uses=1]
137	%tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1		; <<4 x i32>> [#uses=1]
138	%tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2		; <<4 x i32>> [#uses=1]
139	%tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3		; <<4 x i32>> [#uses=1]
140	%tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>		; <<2 x i64>> [#uses=1]
141	ret <2 x i64> %tmp16
142}
143
; test9: assemble the four float arguments %a..%d, in order, into a
; <4 x float> and return it. The floats follow an unused i32 %dummy argument.
; The expected output is a single unaligned movups straight from the stack.
; NOTE(review): presumably %dummy exists to shift the floats off a 16-byte
; stack boundary (test10, without it, expects movaps) - confirm.
144define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
145; CHECK-LABEL: test9:
146; CHECK:       ## BB#0:
147; CHECK-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
148; CHECK-NEXT:    retl
149	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
150	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
151	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
152	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
153	ret <4 x float> %tmp13
154}
155
; test10: assemble the four float arguments %a..%d, in order, into a
; <4 x float> and return it. Unlike test9 there is no leading dummy argument.
; The expected output is a single aligned movaps straight from the stack.
; NOTE(review): presumably the first stack argument is 16-byte aligned on
; this target, which is what permits movaps here - confirm.
156define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
157; CHECK-LABEL: test10:
158; CHECK:       ## BB#0:
159; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
160; CHECK-NEXT:    retl
161	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
162	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
163	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
164	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
165	ret <4 x float> %tmp13
166}
167
; test11: assemble the two double arguments into { %a, %b } and return it.
; The expected output is a single aligned 128-bit movaps from the stack
; rather than two scalar loads plus an unpack.
168define <2 x double> @test11(double %a, double %b) nounwind {
169; CHECK-LABEL: test11:
170; CHECK:       ## BB#0:
171; CHECK-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
172; CHECK-NEXT:    retl
173	%tmp = insertelement <2 x double> undef, double %a, i32 0		; <<2 x double>> [#uses=1]
174	%tmp7 = insertelement <2 x double> %tmp, double %b, i32 1		; <<2 x double>> [#uses=1]
175	ret <2 x double> %tmp7
176}
177
; test12: load a <4 x float> v from the null address and form two shuffles:
;   %tmp2 = { v[0], v[1], 1.0, 1.0 }   (low half of v, high half of the ones)
;   %tmp3 = { v[2], v[3], 0.0, 0.0 }   (high half of v, high half of zero)
; The two are added and the sum is stored back to the null address.
; The expected output implements the first shuffle with movsd into a
; constant-loaded register and the second with unpckhpd against a zeroed
; register, i.e. both are handled as 64-bit half moves.
178define void @test12() nounwind {
179; CHECK-LABEL: test12:
180; CHECK:       ## BB#0:
181; CHECK-NEXT:    movapd 0, %xmm0
182; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
183; CHECK-NEXT:    movsd %xmm0, %xmm1
184; CHECK-NEXT:    xorpd %xmm2, %xmm2
185; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
186; CHECK-NEXT:    addps %xmm1, %xmm0
187; CHECK-NEXT:    movaps %xmm0, 0
188; CHECK-NEXT:    retl
189  %tmp1 = load <4 x float>* null          ; <<4 x float>> [#uses=2]
190  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
191  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
192  %tmp4 = fadd <4 x float> %tmp2, %tmp3            ; <<4 x float>> [#uses=1]
193  store <4 x float> %tmp4, <4 x float>* null
194  ret void
195}
196
; test13: two-input shuffle producing { B[1], C[0], B[1], C[1] } from the
; vectors loaded from %B and %C, stored to %res. The %A argument is not used
; by the body.
; The expected output needs two shufps: the first gathers
; { B[1], B[1], C[0], C[1] } with %C as a memory operand, the second permutes
; lanes [0,2,1,3] into the requested order.
197define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
198; CHECK-LABEL: test13:
199; CHECK:       ## BB#0:
200; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
201; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
202; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
203; CHECK-NEXT:    movaps (%edx), %xmm0
204; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
205; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
206; CHECK-NEXT:    movaps %xmm0, (%eax)
207; CHECK-NEXT:    retl
208  %tmp3 = load <4 x float>* %B            ; <<4 x float>> [#uses=1]
209  %tmp5 = load <4 x float>* %C            ; <<4 x float>> [#uses=1]
210  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >         ; <<4 x float>> [#uses=1]
211  store <4 x float> %tmp11, <4 x float>* %res
212  ret void
213}
214
; test14: compute x+y and x-y on the vectors loaded from %x and %y, then
; return { (x+y)[0], (x+y)[1], (x-y)[0], (x-y)[1] }, i.e. the low halves of
; the sum and the difference.
; The expected output combines the two low 64-bit halves with one unpcklpd.
215define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
216; CHECK-LABEL: test14:
217; CHECK:       ## BB#0:
218; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
219; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
220; CHECK-NEXT:    movaps (%ecx), %xmm1
221; CHECK-NEXT:    movaps (%eax), %xmm2
222; CHECK-NEXT:    movaps %xmm2, %xmm0
223; CHECK-NEXT:    addps %xmm1, %xmm0
224; CHECK-NEXT:    subps %xmm1, %xmm2
225; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
226; CHECK-NEXT:    retl
227  %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=2]
228  %tmp5 = load <4 x float>* %x            ; <<4 x float>> [#uses=2]
229  %tmp9 = fadd <4 x float> %tmp5, %tmp             ; <<4 x float>> [#uses=1]
230  %tmp21 = fsub <4 x float> %tmp5, %tmp            ; <<4 x float>> [#uses=1]
231  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
232  ret <4 x float> %tmp27
233}
234
; test15: return { x[2], x[3], y[2], y[3] }, the high halves of the vectors
; loaded from %x and %y.
; The expected output treats each half as one 64-bit element and uses a
; single unpckhpd with the second vector as a memory operand.
235define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
236; CHECK-LABEL: test15:
237; CHECK:       ## BB#0: ## %entry
238; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
239; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
240; CHECK-NEXT:    movapd (%ecx), %xmm0
241; CHECK-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
242; CHECK-NEXT:    retl
243entry:
244  %tmp = load <4 x float>* %y             ; <<4 x float>> [#uses=1]
245  %tmp3 = load <4 x float>* %x            ; <<4 x float>> [#uses=1]
246  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >           ; <<4 x float>> [#uses=1]
247  ret <4 x float> %tmp4
248}
249
250; PR8900
251
; test16: load the <4 x double> at index 3 of %srcA (byte offset 3 * 32 = 96)
; and return its elements 0 and 2 as a <2 x double>. The 256-bit source type
; is wider than an SSE register, so the load has to be split. The %dst
; argument is not used by the body.
; The expected output loads the first 128-bit half at offset 96 and pulls
; element 2 in with an unpcklpd whose second source is a memory operand.
252define  <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
253; CHECK-LABEL: test16:
254; CHECK:       ## BB#0:
255; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
256; CHECK-NEXT:    movapd 96(%eax), %xmm0
257; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
258; CHECK-NEXT:    retl
259  %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3
260  %i6 = load <4 x double>* %i5, align 32
261  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
262  ret <2 x double> %i7
263}
264
265; PR9009
; test17: shuffle a partially-undef constant <undef, undef, 32768, 32768>
; with a vector that is entirely undef (%0). Mask indices 4 and 5 take lanes
; 0-1 from %0; indices 2 and 3 keep the two 32768 lanes of the constant. The
; result is bitcast to <4 x float> and stored through an undef pointer.
; The expected output folds the shuffle into one constant load, printed as
; <u,u,32768,32768>, followed by a store through whatever is in %eax.
266define fastcc void @test17() nounwind {
267; CHECK-LABEL: test17:
268; CHECK:       ## BB#0: ## %entry
269; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
270; CHECK-NEXT:    movaps %xmm0, (%eax)
271; CHECK-NEXT:    retl
272entry:
273  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
274  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
275  %2 = bitcast <4 x i32> %1 to <4 x float>
276  store <4 x float> %2, <4 x float> * undef
277  ret void
278}
279
280; PR9210
; f: truncate a <4 x double> argument (the unnamed parameter %0) to
; <4 x float>. The 256-bit input arrives split across xmm0 and xmm1 in the
; expected output.
; Each half is converted with its own cvtpd2ps and the two low 64-bit results
; are then joined with unpcklpd.
281define <4 x float> @f(<4 x double>) nounwind {
282; CHECK-LABEL: f:
283; CHECK:       ## BB#0: ## %entry
284; CHECK-NEXT:    cvtpd2ps %xmm1, %xmm1
285; CHECK-NEXT:    cvtpd2ps %xmm0, %xmm0
286; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
287; CHECK-NEXT:    retl
288entry:
289 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
290 ret <4 x float> %double2float.i
291}
292
; test_insert_64_zext: keep the low i64 of %i and replace the high i64 with
; zero. Mask index 0 selects i[0]; index 2 selects element 0 of the constant
; operand, which is the literal 0.
; The expected output is a single register-to-register movq.
293define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
294; CHECK-LABEL: test_insert_64_zext:
295; CHECK:       ## BB#0:
296; CHECK-NEXT:    movq %xmm0, %xmm0
297; CHECK-NEXT:    retl
298  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
299  ret <2 x i64> %1
300}
301
; PR19721: clear the low 32 bits of a vector by round-tripping it through
; i128. The mask -4294967296 is -(2^32), i.e. all ones except the low 32
; bits, so the 'and' zeroes those bits and leaves the rest untouched.
; The expected output does this without leaving the vector unit: it zeroes a
; scratch register with xorps and moves its low lane into xmm0 with movss.
302define <4 x i32> @PR19721(<4 x i32> %i) {
303; CHECK-LABEL: PR19721:
304; CHECK:       ## BB#0:
305; CHECK-NEXT:    xorps %xmm1, %xmm1
306; CHECK-NEXT:    movss %xmm1, %xmm0
307; CHECK-NEXT:    retl
308  %bc = bitcast <4 x i32> %i to i128
309  %insert = and i128 %bc, -4294967296
310  %bc2 = bitcast i128 %insert to <4 x i32>
311  ret <4 x i32> %bc2
312}
313
; test_mul: element-wise multiply of two <4 x i32> vectors.
; The expected output expands the multiply into two pmuludq operations: one
; on the operands as given, and one on pshufd copies that move lanes 1 and 3
; into positions 0 and 2. The two shufps instructions then gather the four
; 32-bit products and put them back in lane order.
; NOTE(review): presumably this expansion is needed because the pentium4
; target has no packed 32-bit multiply instruction - confirm.
314define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
315; CHECK-LABEL: test_mul:
316; CHECK:       ## BB#0:
317; CHECK-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
318; CHECK-NEXT:    pmuludq %xmm1, %xmm0
319; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
320; CHECK-NEXT:    pmuludq %xmm2, %xmm1
321; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
322; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
323; CHECK-NEXT:    retl
324  %m = mul <4 x i32> %x, %y
325  ret <4 x i32> %m
326}
327