1; Tests for SSE2 and below, without SSE3+.
2; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s
3
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
; Overwrite the low lane of the vector loaded from %A with the scalar %B and
; store the result to %r.  Mask <2, 1> takes lane 0 from the second shuffle
; operand (element 0 of the inserted vector, i.e. %B) and lane 1 from the
; loaded vector.  The expected output folds the scalar into a movlpd that
; reads the argument slot directly.
;
; CHECK-LABEL: test1:
; CHECK: 	movl	8(%esp), %eax
; CHECK-NEXT: 	movapd	(%eax), %xmm0
; CHECK-NEXT: 	movlpd	12(%esp), %xmm0
; CHECK-NEXT: 	movl	4(%esp), %eax
; CHECK-NEXT: 	movapd	%xmm0, (%eax)
; CHECK-NEXT: 	ret
  %vec = load <2 x double>* %A, align 16
  %scalar = insertelement <2 x double> undef, double %B, i32 0
  %merged = shufflevector <2 x double> %vec, <2 x double> %scalar, <2 x i32> < i32 2, i32 1 >
  store <2 x double> %merged, <2 x double>* %r, align 16
  ret void
}
19
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind  {
; Overwrite the high lane of the vector loaded from %A with the scalar %B and
; store the result to %r.  Mask <0, 2> keeps lane 0 of the loaded vector and
; takes lane 1 from element 0 of the inserted vector (i.e. %B).  The expected
; output folds the scalar into a movhpd that reads the argument slot directly.
;
; CHECK-LABEL: test2:
; CHECK: 	movl	4(%esp), %eax
; CHECK: 	movl	8(%esp), %ecx
; CHECK-NEXT: 	movapd	(%ecx), %xmm0
; CHECK-NEXT: 	movhpd	12(%esp), %xmm0
; CHECK-NEXT: 	movapd	%xmm0, (%eax)
; CHECK-NEXT: 	ret
  %vec = load <2 x double>* %A, align 16
  %scalar = insertelement <2 x double> undef, double %B, i32 0
  %merged = shufflevector <2 x double> %vec, <2 x double> %scalar, <2 x i32> < i32 0, i32 2 >
  store <2 x double> %merged, <2 x double>* %r, align 16
  ret void
}
35
36
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
; Interleave the two low lanes of the vectors at %A and %B, producing
; <A0, B0, A1, B1>, and store the result to %res.  The extract/insert chain is
; expected to be emitted as a single unpcklps.
  %tmp = load <4 x float>* %B
  %tmp3 = load <4 x float>* %A
  %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0
  %tmp7 = extractelement <4 x float> %tmp, i32 0
  %tmp8 = extractelement <4 x float> %tmp3, i32 1
  %tmp9 = extractelement <4 x float> %tmp, i32 1
  %tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0
  %tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1
  %tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2
  %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3
  store <4 x float> %tmp13, <4 x float>* %res
  ret void
; CHECK-LABEL anchors on the function's symbol definition and makes it a
; partition boundary for FileCheck, so checks from the preceding function
; cannot be satisfied by this function's output.
; CHECK-LABEL: test3:
; CHECK: 	unpcklps
}
53
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
; Shuffle %X with mask <2, 6, 3, 7> and store the result to %res.  Indices 6
; and 7 select from the undef second operand, so only lanes 0 and 2 of the
; result (X2 and X3) are constrained; the expected output is a single-source
; pshufd.
  %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >
  store <4 x float> %tmp5, <4 x float>* %res
  ret void
; CHECK-LABEL anchors on the function's symbol definition and makes it a
; partition boundary for FileCheck, so checks from the preceding function
; cannot be satisfied by this function's output.
; CHECK-LABEL: test4:
; CHECK: 	pshufd	$50, %xmm0, %xmm0
}
61
define <4 x i32> @test5(i8** %ptr) nounwind {
; Load one float through the pointer stored at %ptr, place it in lane 0 of an
; otherwise zero <4 x float>, then widen it in two steps: interleave its low
; eight bytes with zero bytes, and interleave zero words with the low four
; words of that result.  The expected output uses a pxor for the zero vector
; followed by punpcklbw and punpcklwd.
;
; CHECK-LABEL: test5:
; CHECK: pxor
; CHECK: punpcklbw
; CHECK: punpcklwd

  %raw = load i8** %ptr
  %fptr = bitcast i8* %raw to float*
  %f = load float* %fptr
  %v0 = insertelement <4 x float> undef, float %f, i32 0
  %v1 = insertelement <4 x float> %v0, float 0.000000e+00, i32 1
  %v2 = insertelement <4 x float> %v1, float 0.000000e+00, i32 2
  %v3 = insertelement <4 x float> %v2, float 0.000000e+00, i32 3
  %bytes = bitcast <4 x float> %v3 to <16 x i8>
  %bytes.ilv = shufflevector <16 x i8> %bytes, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 >
  %words = bitcast <16 x i8> %bytes.ilv to <8 x i16>
  %words.ilv = shufflevector <8 x i16> zeroinitializer, <8 x i16> %words, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 >
  %result = bitcast <8 x i16> %words.ilv to <4 x i32>
  ret <4 x i32> %result
}
82
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
; Shuffle the vector at %A with mask <0, 5, 6, 7> and store it to %res.  Only
; lane 0 comes from the loaded vector; lanes 1-3 index the undef operand, so
; the expected output is just an aligned load followed by an aligned store.
;
; CHECK-LABEL: test6:
; CHECK: 	movaps	(%ecx), %xmm0
; CHECK:	movaps	%xmm0, (%eax)
  %src = load <4 x float>* %A
  %shuffled = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >
  store <4 x float> %shuffled, <4 x float>* %res
  ret void
}
93
define void @test7() nounwind {
; Splat lane 0 of an all-zero vector (the shuffle mask is zeroinitializer) and
; store the result to address null.  The expected output materialises the
; zero vector with xorps and stores it to absolute address 0.
;
; CHECK-LABEL: test7:
; CHECK:	xorps	%xmm0, %xmm0
; CHECK:	movaps	%xmm0, 0
  %zero = bitcast <4 x i32> zeroinitializer to <4 x float>
  %splat = shufflevector <4 x float> %zero, <4 x float> zeroinitializer, <4 x i32> zeroinitializer
  store <4 x float> %splat, <4 x float>* null
  ret void
}
104
105@x = external global [4 x i32]
106
define <2 x i64> @test8() nounwind {
; Load the four consecutive i32 elements of @x, assemble them in order into a
; <4 x i32> and return it reinterpreted as <2 x i64>.  The four scalar loads
; are expected to be merged into one unaligned 16-byte load (movups).
;
; CHECK-LABEL: test8:
; CHECK: movups	(%eax), %xmm0
  %x0 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0)
  %x1 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1)
  %x2 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2)
  %x3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 3)
  %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
  %as.i64 = bitcast <4 x i32> %v3 to <2 x i64>
  ret <2 x i64> %as.i64
}
121
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
; Build <a, b, c, d> from four float arguments that follow an unused i32.
; The expected output reads all four with one unaligned load from 8(%esp).
; NOTE(review): presumably %dummy exists to push the floats off a 16-byte
; boundary so that movups, not movaps, is required - confirm against the ABI.
;
; CHECK-LABEL: test9:
; CHECK: movups	8(%esp), %xmm0
  %v0 = insertelement <4 x float> undef, float %a, i32 0
  %v1 = insertelement <4 x float> %v0, float %b, i32 1
  %v2 = insertelement <4 x float> %v1, float %c, i32 2
  %v3 = insertelement <4 x float> %v2, float %d, i32 3
  ret <4 x float> %v3
}
131
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
; Build <a, b, c, d> from four consecutive float arguments.  The expected
; output reads all four with one aligned 16-byte load from 4(%esp).
;
; CHECK-LABEL: test10:
; CHECK: movaps	4(%esp), %xmm0
  %v0 = insertelement <4 x float> undef, float %a, i32 0
  %v1 = insertelement <4 x float> %v0, float %b, i32 1
  %v2 = insertelement <4 x float> %v1, float %c, i32 2
  %v3 = insertelement <4 x float> %v2, float %d, i32 3
  ret <4 x float> %v3
}
141
define <2 x double> @test11(double %a, double %b) nounwind {
; Build <a, b> from two consecutive double arguments.  The expected output
; reads both with one aligned 16-byte load from 4(%esp).
;
; CHECK-LABEL: test11:
; CHECK: movaps	4(%esp), %xmm0
  %v0 = insertelement <2 x double> undef, double %a, i32 0
  %v1 = insertelement <2 x double> %v0, double %b, i32 1
  ret <2 x double> %v1
}
149
define void @test12() nounwind {
; Load a vector V from address null, form <V0, V1, 1.0, 1.0> and
; <V2, V3, 0.0, 0.0>, add them and store the sum back to null.  The expected
; output contains a movhlps and a shufps.
;
; CHECK-LABEL: test12:
; CHECK: movhlps
; CHECK: shufps
  %src = load <4 x float>* null
  %lo.ones = shufflevector <4 x float> %src, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >
  %hi.zeros = shufflevector <4 x float> %src, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >
  %sum = fadd <4 x float> %lo.ones, %hi.zeros
  store <4 x float> %sum, <4 x float>* null
  ret void
}
161
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; Shuffle the vectors at %B and %C with mask <1, 4, 1, 5>, producing
; <B1, C0, B1, C1>, and store the result to %res (%A is not used).  The
; expected output is a shufps with a folded memory operand followed by a
; pshufd that puts the lanes in their final order.
  %tmp3 = load <4 x float>* %B
  %tmp5 = load <4 x float>* %C
  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >
  store <4 x float> %tmp11, <4 x float>* %res
  ret void
; CHECK-LABEL anchors on the function's symbol definition and makes it a
; partition boundary for FileCheck, so checks from the preceding function
; cannot be satisfied by this function's output.
; CHECK-LABEL: test13:
; CHECK: shufps	$69, (%ecx), %xmm0
; CHECK: pshufd	$-40, %xmm0, %xmm0
}
172
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
; Compute x+y and x-y on the vectors at %x and %y, then return the low two
; lanes of the sum followed by the low two lanes of the difference
; (mask <0, 1, 4, 5>).  The expected output combines the two halves with a
; register-to-register movlhps; the FileCheck variables tie the registers of
; the add, the sub and the movlhps together.
;
; CHECK-LABEL: test14:
; CHECK: 	addps	[[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]]
; CHECK: 	subps	[[X1]], [[X2:%xmm[0-9]+]]
; CHECK: 	movlhps	[[X2]], [[X0]]
  %yv = load <4 x float>* %y
  %xv = load <4 x float>* %x
  %sum = fadd <4 x float> %xv, %yv
  %diff = fsub <4 x float> %xv, %yv
  %lows = shufflevector <4 x float> %sum, <4 x float> %diff, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >
  ret <4 x float> %lows
}
185
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
; Return the high two lanes of the vector at %x followed by the high two
; lanes of the vector at %y (mask <2, 3, 6, 7>).  The expected output is a
; register-to-register movhlps.
;
; CHECK-LABEL: test15:
; CHECK: 	movhlps	%xmm1, %xmm0
entry:
  %yv = load <4 x float>* %y
  %xv = load <4 x float>* %x
  %highs = shufflevector <4 x float> %xv, <4 x float> %yv, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >
  ret <4 x float> %highs
}
195
; PR8900
; Load the <4 x double> at index 3 from %srcA and return its elements 0 and 2
; as a <2 x double> (%dst is not used).  The expected output contains an
; unpcklpd before the return.
define  <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
; CHECK-LABEL: test16:
; CHECK: unpcklpd
; CHECK: ret
  %slot = getelementptr inbounds <4 x double>* %srcA, i32 3
  %wide = load <4 x double>* %slot, align 32
  %evens = shufflevector <4 x double> %wide, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %evens
}
207
; PR9009
; Shuffle a partly-undef constant vector with a vector built from undef
; (mask <4, 5, 2, 3>) and store the result through an undef pointer.  No
; particular instruction sequence is checked; the label and the return only
; confirm that code was produced for this function.  The CHECK-LABEL also
; bounds the checks of the preceding function so they cannot be satisfied by
; this function's output.
define fastcc void @test17() nounwind {
; CHECK-LABEL: test17:
; CHECK: ret
entry:
  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %2 = bitcast <4 x i32> %1 to <4 x float>
  store <4 x float> %2, <4 x float> * undef
  ret void
}
217
; PR9210
; Truncate a <4 x double> argument to <4 x float>.  No particular instruction
; sequence is checked; the label and the return only confirm that code was
; produced for this function.  The CHECK-LABEL also bounds the checks of the
; preceding functions so they cannot be satisfied by this function's output.
; The label is spelled with the leading underscore used for symbols on the
; darwin triple of the RUN line, because a bare "f:" is too short to be a
; safe substring match.
define <4 x float> @f(<4 x double>) nounwind {
; CHECK-LABEL: _f:
; CHECK: ret
entry:
 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
 ret <4 x float> %double2float.i
}
224