; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512

; Tests for SSE2 and below, without SSE3+.

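; test1: insert scalar %B into element 0 of the vector loaded from %A;
; this folds to a single low-lane move or blend (movlps / shufpd / vblendps).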
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-SSE-LABEL: test1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test1:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[0],mem[1]
; X64-SSE-NEXT:    movapd %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test1:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
	%tmp3 = load <2 x double>, <2 x double>* %A, align 16
	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
	store <2 x double> %tmp9, <2 x double>* %r, align 16
	ret void
}

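; test2: the same insertion into element 1 (the high lane), which folds to
; movhps / movlhps instead.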
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-SSE-LABEL: test2:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test2:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test2:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT:    movaps %xmm1, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test2:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm1
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
	%tmp3 = load <2 x double>, <2 x double>* %A, align 16
	%tmp7 = insertelement <2 x double> undef, double %B, i32 0
	%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
	store <2 x double> %tmp9, <2 x double>* %r, align 16
	ret void
}

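; test3: an element-by-element interleave of the low halves of two loaded
; <4 x float> vectors; recognized as a single unpcklps.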
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
; X86-SSE-LABEL: test3:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test3:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test3:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test3:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
	%tmp = load <4 x float>, <4 x float>* %B		; <<4 x float>> [#uses=2]
	%tmp3 = load <4 x float>, <4 x float>* %A		; <<4 x float>> [#uses=2]
	%tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0		; <float> [#uses=1]
	%tmp7 = extractelement <4 x float> %tmp, i32 0		; <float> [#uses=1]
	%tmp8 = extractelement <4 x float> %tmp3, i32 1		; <float> [#uses=1]
	%tmp9 = extractelement <4 x float> %tmp, i32 1		; <float> [#uses=1]
	%tmp10 = insertelement <4 x float> undef, float %tmp.upgrd.1, i32 0		; <<4 x float>> [#uses=1]
	%tmp11 = insertelement <4 x float> %tmp10, float %tmp7, i32 1		; <<4 x float>> [#uses=1]
	%tmp12 = insertelement <4 x float> %tmp11, float %tmp8, i32 2		; <<4 x float>> [#uses=1]
	%tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3		; <<4 x float>> [#uses=1]
	store <4 x float> %tmp13, <4 x float>* %res
	ret void
}

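; test4: a single-input shuffle (the second operand is undef), so the
; <2,6,3,7> mask folds to one shufps / vpermilps.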
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
; X86-SSE-LABEL: test4:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test4:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test4:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test4:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
	%tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
	store <4 x float> %tmp5, <4 x float>* %res
	ret void
}

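; test5: load one float through an i8* and interleave its bytes with zeros
; twice; lowered as punpcklbw + punpcklwd (AVX uses vpmovzxbw for the first
; step).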
define <4 x i32> @test5(i8** %ptr) nounwind {
; X86-SSE-LABEL: test5:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl (%eax), %eax
; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pxor %xmm0, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test5:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl (%eax), %eax
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X86-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test5:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movq (%rdi), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pxor %xmm0, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test5:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movq (%rdi), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-AVX-NEXT:    retq
	%tmp = load i8*, i8** %ptr		; <i8*> [#uses=1]
	%tmp.upgrd.1 = bitcast i8* %tmp to float*		; <float*> [#uses=1]
	%tmp.upgrd.2 = load float, float* %tmp.upgrd.1		; <float> [#uses=1]
	%tmp.upgrd.3 = insertelement <4 x float> undef, float %tmp.upgrd.2, i32 0		; <<4 x float>> [#uses=1]
	%tmp9 = insertelement <4 x float> %tmp.upgrd.3, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
	%tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
	%tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
	%tmp21 = bitcast <4 x float> %tmp11 to <16 x i8>		; <<16 x i8>> [#uses=1]
	%tmp22 = shufflevector <16 x i8> %tmp21, <16 x i8> zeroinitializer, <16 x i32> < i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23 >		; <<16 x i8>> [#uses=1]
	%tmp31 = bitcast <16 x i8> %tmp22 to <8 x i16>		; <<8 x i16>> [#uses=1]
	%tmp.upgrd.4 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %tmp31, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 >		; <<8 x i16>> [#uses=1]
	%tmp36 = bitcast <8 x i16> %tmp.upgrd.4 to <4 x i32>		; <<4 x i32>> [#uses=1]
	ret <4 x i32> %tmp36
}

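; test6: the <0,5,6,7> mask with an undef second operand is effectively an
; identity shuffle, so the whole function collapses to a plain 16-byte copy.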
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
; X86-SSE-LABEL: test6:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test6:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test6:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm0
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test6:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp1 = load <4 x float>, <4 x float>* %A            ; <<4 x float>> [#uses=1]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 >          ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp2, <4 x float>* %res
  ret void
}

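; test7: a splat taken from a zero vector is still zero, so this folds to
; xorps plus a single store to address 0.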
define void @test7() nounwind {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, 0
; AVX-NEXT:    ret{{[l|q]}}
  bitcast <4 x i32> zeroinitializer to <4 x float>                ; <<4 x float>>:1 [#uses=1]
  shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer         ; <<4 x float>>:2 [#uses=1]
  store <4 x float> %2, <4 x float>* null
  ret void
}

@x = external dso_local global [4 x i32]

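; test8: four scalar loads from consecutive elements of @x merge into one
; unaligned vector load (movups).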
define <2 x i64> @test8() nounwind {
; X86-SSE-LABEL: test8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups x, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups x, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test8:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups x(%rip), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovups x(%rip), %xmm0
; X64-AVX-NEXT:    retq
	%tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0)		; <i32> [#uses=1]
	%tmp3 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 1)		; <i32> [#uses=1]
	%tmp5 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 2)		; <i32> [#uses=1]
	%tmp7 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 3)		; <i32> [#uses=1]
	%tmp.upgrd.1 = insertelement <4 x i32> undef, i32 %tmp, i32 0		; <<4 x i32>> [#uses=1]
	%tmp13 = insertelement <4 x i32> %tmp.upgrd.1, i32 %tmp3, i32 1		; <<4 x i32>> [#uses=1]
	%tmp14 = insertelement <4 x i32> %tmp13, i32 %tmp5, i32 2		; <<4 x i32>> [#uses=1]
	%tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3		; <<4 x i32>> [#uses=1]
	%tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64>		; <<2 x i64>> [#uses=1]
	ret <2 x i64> %tmp16
}

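; test9: build a <4 x float> from four scalar arguments; on i386 they are
; contiguous on the stack (one movups), while on x86-64 they arrive in
; xmm0-xmm3 and are combined with unpcklps/movlhps or vinsertps.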
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test9:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test9:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test9:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test9:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
	ret <4 x float> %tmp13
}

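; test10: as test9, but without the leading dummy argument.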
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
; X86-SSE-LABEL: test10:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test10:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test10:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test10:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX-NEXT:    retq
	%tmp = insertelement <4 x float> undef, float %a, i32 0		; <<4 x float>> [#uses=1]
	%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1		; <<4 x float>> [#uses=1]
	%tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2		; <<4 x float>> [#uses=1]
	%tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3		; <<4 x float>> [#uses=1]
	ret <4 x float> %tmp13
}

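; test11: pack two scalar doubles into a <2 x double>: movlhps on x86-64,
; a single movups from the stack on i386.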
define <2 x double> @test11(double %a, double %b) nounwind {
; X86-SSE-LABEL: test11:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movups {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test11:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test11:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test11:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-AVX-NEXT:    retq
	%tmp = insertelement <2 x double> undef, double %a, i32 0		; <<2 x double>> [#uses=1]
	%tmp7 = insertelement <2 x double> %tmp, double %b, i32 1		; <<2 x double>> [#uses=1]
	ret <2 x double> %tmp7
}

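; test12: add a vector's low half (with 1.0 in the upper lanes) to its own
; high half (zero-filled); compares the movsd/movhlps (SSE) and
; vblendps/vunpckhpd (AVX) lowerings.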
define void @test12() nounwind {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd 0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    addps %xmm1, %xmm2
; SSE-NEXT:    movaps %xmm2, 0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps 0, %xmm0
; AVX1-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, 0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test12:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps 0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; AVX512-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmovaps %xmm0, 0
; AVX512-NEXT:    ret{{[l|q]}}
  %tmp1 = load <4 x float>, <4 x float>* null          ; <<4 x float>> [#uses=2]
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 >             ; <<4 x float>> [#uses=1]
  %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >                ; <<4 x float>> [#uses=1]
  %tmp4 = fadd <4 x float> %tmp2, %tmp3            ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp4, <4 x float>* null
  ret void
}

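; test13: the cross-operand <1,4,1,5> mask needs two shufps: one to gather
; the lanes (with a folded load) and one to put them back in order.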
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; X86-SSE-LABEL: test13:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movaps (%edx), %xmm0
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test13:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    vmovaps (%edx), %xmm0
; X86-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X86-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test13:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdx), %xmm0
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test13:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdx), %xmm0
; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT:    retq
  %tmp3 = load <4 x float>, <4 x float>* %B            ; <<4 x float>> [#uses=1]
  %tmp5 = load <4 x float>, <4 x float>* %C            ; <<4 x float>> [#uses=1]
  %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 >         ; <<4 x float>> [#uses=1]
  store <4 x float> %tmp11, <4 x float>* %res
  ret void
}

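; test14: add and subtract the same two loaded vectors, then keep the low
; half of each result; the <0,1,4,5> mask becomes a single movlhps.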
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-SSE-LABEL: test14:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm1
; X86-SSE-NEXT:    movaps (%eax), %xmm2
; X86-SSE-NEXT:    movaps %xmm2, %xmm0
; X86-SSE-NEXT:    addps %xmm1, %xmm0
; X86-SSE-NEXT:    subps %xmm1, %xmm2
; X86-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test14:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vmovaps (%eax), %xmm1
; X86-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X86-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test14:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rsi), %xmm1
; X64-SSE-NEXT:    movaps (%rdi), %xmm2
; X64-SSE-NEXT:    movaps %xmm2, %xmm0
; X64-SSE-NEXT:    addps %xmm1, %xmm0
; X64-SSE-NEXT:    subps %xmm1, %xmm2
; X64-SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test14:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rsi), %xmm0
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm1
; X64-AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm2
; X64-AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; X64-AVX-NEXT:    retq
  %tmp = load <4 x float>, <4 x float>* %y             ; <<4 x float>> [#uses=2]
  %tmp5 = load <4 x float>, <4 x float>* %x            ; <<4 x float>> [#uses=2]
  %tmp9 = fadd <4 x float> %tmp5, %tmp             ; <<4 x float>> [#uses=1]
  %tmp21 = fsub <4 x float> %tmp5, %tmp            ; <<4 x float>> [#uses=1]
  %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 >                ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp27
}

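; test15: the <2,3,6,7> mask takes the high half of both inputs, which is
; exactly unpckhpd with a folded memory operand.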
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-SSE-LABEL: test15:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movaps (%ecx), %xmm0
; X86-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test15:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X86-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test15:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps (%rdi), %xmm0
; X64-SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test15:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-AVX-NEXT:    retq
entry:
  %tmp = load <4 x float>, <4 x float>* %y             ; <<4 x float>> [#uses=1]
  %tmp3 = load <4 x float>, <4 x float>* %x            ; <<4 x float>> [#uses=1]
  %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 >           ; <<4 x float>> [#uses=1]
  ret <4 x float> %tmp4
}

; PR8900
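; test16: extract the even elements of a <4 x double> loaded at offset 96;
; only the two doubles that are used get loaded (movaps plus unpcklpd with
; a folded load).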
define <2 x double> @test16(<4 x double>* nocapture %srcA, <2 x double>* nocapture %dst) {
; X86-SSE-LABEL: test16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movaps 96(%eax), %xmm0
; X86-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps 96(%eax), %xmm0
; X86-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test16:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps 96(%rdi), %xmm0
; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps 96(%rdi), %xmm0
; X64-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-AVX-NEXT:    retq
  %i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
  %i6 = load <4 x double>, <4 x double>* %i5, align 32
  %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %i7
}

; PR9009
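; test17: the checks verify that shuffling an undef insertelement into a
; constant still materializes the <u,u,32768,32768> store.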
define fastcc void @test17() nounwind {
; X86-SSE-LABEL: test17:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X86-SSE-NEXT:    movaps %xmm0, (%eax)
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: test17:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: test17:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-SSE-LABEL: test17:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X64-SSE-NEXT:    movaps %xmm0, (%rax)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: test17:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rax)
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: test17:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rax)
; X64-AVX512-NEXT:    retq
entry:
  %0 = insertelement <4 x i32> undef, i32 undef, i32 1
  %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  %2 = bitcast <4 x i32> %1 to <4 x float>
  store <4 x float> %2, <4 x float>* undef
  ret void
}

; PR9210
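; f: vector fptrunc; SSE needs two cvtpd2ps plus an unpcklpd, while AVX
; converts the whole <4 x double> with one ymm vcvtpd2ps.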
define <4 x float> @f(<4 x double>) nounwind {
; SSE-LABEL: f:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    cvtpd2ps %xmm1, %xmm1
; SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: f:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vcvtpd2ps %ymm0, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    ret{{[l|q]}}
entry:
 %double2float.i = fptrunc <4 x double> %0 to <4 x float>
 ret <4 x float> %double2float.i
}

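; test_insert_64_zext: shuffling a zero constant into the upper i64 is
; matched to the implicitly zero-extending movq.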
define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
; SSE-LABEL: test_insert_64_zext:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_insert_64_zext:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    ret{{[l|q]}}
  %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %1
}

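; PR19721: clearing the low 32 bits through an i128 bitcast; SSE uses an
; andps with a constant-pool mask, AVX blends in a zeroed lane.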
define <4 x i32> @PR19721(<4 x i32> %i) {
; X86-SSE-LABEL: PR19721:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    retl
;
; AVX-LABEL: PR19721:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
;
; X64-SSE-LABEL: PR19721:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT:    retq
  %bc = bitcast <4 x i32> %i to i128
  %insert = and i128 %bc, -4294967296
  %bc2 = bitcast i128 %insert to <4 x i32>
  ret <4 x i32> %bc2
}

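; test_mul: <4 x i32> multiply; SSE2 has no 32-bit pmulld, so it is
; expanded with pmuludq and shuffles, while AVX emits vpmulld directly.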
define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: test_mul:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    pmuludq %xmm2, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %m = mul <4 x i32> %x, %y
  ret <4 x i32> %m
}