1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
4; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
5; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
6; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
7; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
8; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
9; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
10; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL
11; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL
12
; Single-input v4i32 shuffle with mask <0,0,0,1>. Every index is below 4, so
; only %a is read and %b is ignored. The checks expect a lone pshufd on the
; SSE run lines and a lone vpermilps on the AVX run lines.
13define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
14; SSE-LABEL: shuffle_v4i32_0001:
15; SSE:       # %bb.0:
16; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
17; SSE-NEXT:    retq
18;
19; AVX-LABEL: shuffle_v4i32_0001:
20; AVX:       # %bb.0:
21; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
22; AVX-NEXT:    retq
23  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
24  ret <4 x i32> %shuffle
25}
; Single-input v4i32 shuffle with mask <0,0,2,0>; only %a is referenced.
; Expected lowering is one pshufd (SSE runs) or one vpermilps (AVX runs).
26define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
27; SSE-LABEL: shuffle_v4i32_0020:
28; SSE:       # %bb.0:
29; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
30; SSE-NEXT:    retq
31;
32; AVX-LABEL: shuffle_v4i32_0020:
33; AVX:       # %bb.0:
34; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
35; AVX-NEXT:    retq
36  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
37  ret <4 x i32> %shuffle
38}
; Single-input v4i32 shuffle with mask <0,1,1,2>; only %a is referenced.
; Expected lowering is one pshufd (SSE runs) or one vpermilps (AVX runs).
39define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
40; SSE-LABEL: shuffle_v4i32_0112:
41; SSE:       # %bb.0:
42; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
43; SSE-NEXT:    retq
44;
45; AVX-LABEL: shuffle_v4i32_0112:
46; AVX:       # %bb.0:
47; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
48; AVX-NEXT:    retq
49  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
50  ret <4 x i32> %shuffle
51}
; Single-input v4i32 shuffle with mask <0,3,0,0>; only %a is referenced.
; Expected lowering is one pshufd (SSE runs) or one vpermilps (AVX runs).
52define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
53; SSE-LABEL: shuffle_v4i32_0300:
54; SSE:       # %bb.0:
55; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
56; SSE-NEXT:    retq
57;
58; AVX-LABEL: shuffle_v4i32_0300:
59; AVX:       # %bb.0:
60; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
61; AVX-NEXT:    retq
62  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
63  ret <4 x i32> %shuffle
64}
; Single-input v4i32 shuffle with mask <1,0,0,0>; only %a is referenced.
; Expected lowering is one pshufd (SSE runs) or one vpermilps (AVX runs).
65define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
66; SSE-LABEL: shuffle_v4i32_1000:
67; SSE:       # %bb.0:
68; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
69; SSE-NEXT:    retq
70;
71; AVX-LABEL: shuffle_v4i32_1000:
72; AVX:       # %bb.0:
73; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
74; AVX-NEXT:    retq
75  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
76  ret <4 x i32> %shuffle
77}
; Single-input v4i32 shuffle with mask <2,2,0,0>; only %a is referenced.
; Expected lowering is one pshufd (SSE runs) or one vpermilps (AVX runs).
78define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
79; SSE-LABEL: shuffle_v4i32_2200:
80; SSE:       # %bb.0:
81; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
82; SSE-NEXT:    retq
83;
84; AVX-LABEL: shuffle_v4i32_2200:
85; AVX:       # %bb.0:
86; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
87; AVX-NEXT:    retq
88  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
89  ret <4 x i32> %shuffle
90}
; Single-input v4i32 shuffle with mask <3,3,3,0>; only %a is referenced.
; Expected lowering is one pshufd (SSE runs) or one vpermilps (AVX runs).
91define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
92; SSE-LABEL: shuffle_v4i32_3330:
93; SSE:       # %bb.0:
94; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
95; SSE-NEXT:    retq
96;
97; AVX-LABEL: shuffle_v4i32_3330:
98; AVX:       # %bb.0:
99; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
100; AVX-NEXT:    retq
101  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
102  ret <4 x i32> %shuffle
103}
; Single-input v4i32 shuffle that reverses the lanes of %a (mask <3,2,1,0>).
; Expected lowering is one pshufd (SSE runs) or one vpermilps (AVX runs).
104define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
105; SSE-LABEL: shuffle_v4i32_3210:
106; SSE:       # %bb.0:
107; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
108; SSE-NEXT:    retq
109;
110; AVX-LABEL: shuffle_v4i32_3210:
111; AVX:       # %bb.0:
112; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
113; AVX-NEXT:    retq
114  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
115  ret <4 x i32> %shuffle
116}
117
; Single-input v4i32 shuffle with the repeating mask <2,1,2,1>; only %a is
; referenced. Expected lowering is one pshufd (SSE runs) or one vpermilps
; (AVX runs).
118define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
119; SSE-LABEL: shuffle_v4i32_2121:
120; SSE:       # %bb.0:
121; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
122; SSE-NEXT:    retq
123;
124; AVX-LABEL: shuffle_v4i32_2121:
125; AVX:       # %bb.0:
126; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1]
127; AVX-NEXT:    retq
128  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
129  ret <4 x i32> %shuffle
130}
131
; Single-input v4f32 shuffle with mask <0,0,0,1>; only %a is referenced.
; Unlike the integer variant, the SSE runs expect shufps here; the AVX runs
; expect vpermilps.
132define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
133; SSE-LABEL: shuffle_v4f32_0001:
134; SSE:       # %bb.0:
135; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
136; SSE-NEXT:    retq
137;
138; AVX-LABEL: shuffle_v4f32_0001:
139; AVX:       # %bb.0:
140; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
141; AVX-NEXT:    retq
142  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
143  ret <4 x float> %shuffle
144}
; Single-input v4f32 shuffle with mask <0,0,2,0>; only %a is referenced.
; Expected lowering is one shufps (SSE runs) or one vpermilps (AVX runs).
145define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
146; SSE-LABEL: shuffle_v4f32_0020:
147; SSE:       # %bb.0:
148; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
149; SSE-NEXT:    retq
150;
151; AVX-LABEL: shuffle_v4f32_0020:
152; AVX:       # %bb.0:
153; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
154; AVX-NEXT:    retq
155  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
156  ret <4 x float> %shuffle
157}
; Single-input v4f32 shuffle with mask <0,3,0,0>; only %a is referenced.
; Expected lowering is one shufps (SSE runs) or one vpermilps (AVX runs).
158define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
159; SSE-LABEL: shuffle_v4f32_0300:
160; SSE:       # %bb.0:
161; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
162; SSE-NEXT:    retq
163;
164; AVX-LABEL: shuffle_v4f32_0300:
165; AVX:       # %bb.0:
166; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
167; AVX-NEXT:    retq
168  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
169  ret <4 x float> %shuffle
170}
; Single-input v4f32 shuffle with mask <1,0,0,0>; only %a is referenced.
; Expected lowering is one shufps (SSE runs) or one vpermilps (AVX runs).
171define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
172; SSE-LABEL: shuffle_v4f32_1000:
173; SSE:       # %bb.0:
174; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
175; SSE-NEXT:    retq
176;
177; AVX-LABEL: shuffle_v4f32_1000:
178; AVX:       # %bb.0:
179; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
180; AVX-NEXT:    retq
181  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
182  ret <4 x float> %shuffle
183}
; Single-input v4f32 shuffle with mask <2,2,0,0>; only %a is referenced.
; Expected lowering is one shufps (SSE runs) or one vpermilps (AVX runs).
184define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
185; SSE-LABEL: shuffle_v4f32_2200:
186; SSE:       # %bb.0:
187; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
188; SSE-NEXT:    retq
189;
190; AVX-LABEL: shuffle_v4f32_2200:
191; AVX:       # %bb.0:
192; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
193; AVX-NEXT:    retq
194  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
195  ret <4 x float> %shuffle
196}
; Single-input v4f32 shuffle with mask <3,3,3,0>; only %a is referenced.
; Expected lowering is one shufps (SSE runs) or one vpermilps (AVX runs).
197define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
198; SSE-LABEL: shuffle_v4f32_3330:
199; SSE:       # %bb.0:
200; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
201; SSE-NEXT:    retq
202;
203; AVX-LABEL: shuffle_v4f32_3330:
204; AVX:       # %bb.0:
205; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
206; AVX-NEXT:    retq
207  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
208  ret <4 x float> %shuffle
209}
; Single-input v4f32 shuffle that reverses the lanes of %a (mask <3,2,1,0>).
; Expected lowering is one shufps (SSE runs) or one vpermilps (AVX runs).
210define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
211; SSE-LABEL: shuffle_v4f32_3210:
212; SSE:       # %bb.0:
213; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
214; SSE-NEXT:    retq
215;
216; AVX-LABEL: shuffle_v4f32_3210:
217; AVX:       # %bb.0:
218; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
219; AVX-NEXT:    retq
220  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
221  ret <4 x float> %shuffle
222}
; Single-input v4f32 shuffle duplicating each of the two low lanes of %a
; (mask <0,0,1,1>). The SSE runs expect unpcklps of xmm0 with itself; the AVX
; runs expect vpermilps.
223define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
224; SSE-LABEL: shuffle_v4f32_0011:
225; SSE:       # %bb.0:
226; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
227; SSE-NEXT:    retq
228;
229; AVX-LABEL: shuffle_v4f32_0011:
230; AVX:       # %bb.0:
231; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
232; AVX-NEXT:    retq
233  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
234  ret <4 x float> %shuffle
235}
; Single-input v4f32 shuffle duplicating each of the two high lanes of %a
; (mask <2,2,3,3>). The SSE runs expect unpckhps of xmm0 with itself; the AVX
; runs expect vpermilps.
236define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
237; SSE-LABEL: shuffle_v4f32_2233:
238; SSE:       # %bb.0:
239; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
240; SSE-NEXT:    retq
241;
242; AVX-LABEL: shuffle_v4f32_2233:
243; AVX:       # %bb.0:
244; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
245; AVX-NEXT:    retq
246  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
247  ret <4 x float> %shuffle
248}
; Single-input v4f32 shuffle duplicating the even lanes of %a (mask <0,0,2,2>).
; The plain SSE2 run expects shufps; the SSE3, SSSE3 and SSE4.1 runs expect
; movsldup, and the AVX runs expect vmovsldup.
249define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
250; SSE2-LABEL: shuffle_v4f32_0022:
251; SSE2:       # %bb.0:
252; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
253; SSE2-NEXT:    retq
254;
255; SSE3-LABEL: shuffle_v4f32_0022:
256; SSE3:       # %bb.0:
257; SSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
258; SSE3-NEXT:    retq
259;
260; SSSE3-LABEL: shuffle_v4f32_0022:
261; SSSE3:       # %bb.0:
262; SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
263; SSSE3-NEXT:    retq
264;
265; SSE41-LABEL: shuffle_v4f32_0022:
266; SSE41:       # %bb.0:
267; SSE41-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
268; SSE41-NEXT:    retq
269;
270; AVX-LABEL: shuffle_v4f32_0022:
271; AVX:       # %bb.0:
272; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
273; AVX-NEXT:    retq
274  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
275  ret <4 x float> %shuffle
276}
; Single-input v4f32 shuffle duplicating the odd lanes of %a (mask <1,1,3,3>).
; The plain SSE2 run expects shufps; the SSE3, SSSE3 and SSE4.1 runs expect
; movshdup, and the AVX runs expect vmovshdup.
277define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
278; SSE2-LABEL: shuffle_v4f32_1133:
279; SSE2:       # %bb.0:
280; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
281; SSE2-NEXT:    retq
282;
283; SSE3-LABEL: shuffle_v4f32_1133:
284; SSE3:       # %bb.0:
285; SSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
286; SSE3-NEXT:    retq
287;
288; SSSE3-LABEL: shuffle_v4f32_1133:
289; SSSE3:       # %bb.0:
290; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
291; SSSE3-NEXT:    retq
292;
293; SSE41-LABEL: shuffle_v4f32_1133:
294; SSE41:       # %bb.0:
295; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
296; SSE41-NEXT:    retq
297;
298; AVX-LABEL: shuffle_v4f32_1133:
299; AVX:       # %bb.0:
300; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
301; AVX-NEXT:    retq
302  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
303  ret <4 x float> %shuffle
304}
305
; Two-input v4f32 shuffle with mask <0,1,4,5>: the low two lanes of %a followed
; by the low two lanes of %b (indices 4 and up select from the second operand).
; Expected lowering is one movlhps (SSE runs) or one vmovlhps (AVX runs).
306define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
307; SSE-LABEL: shuffle_v4f32_0145:
308; SSE:       # %bb.0:
309; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
310; SSE-NEXT:    retq
311;
312; AVX-LABEL: shuffle_v4f32_0145:
313; AVX:       # %bb.0:
314; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
315; AVX-NEXT:    retq
316  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
317  ret <4 x float> %shuffle
318}
319
; Two-input v4f32 shuffle with mask <6,7,2,3>: the high two lanes of %b
; followed by the high two lanes of %a. The SSE runs expect movhlps; the AVX
; runs expect vunpckhpd with the operands in (xmm1, xmm0) order.
320define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
321; SSE-LABEL: shuffle_v4f32_6723:
322; SSE:       # %bb.0:
323; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
324; SSE-NEXT:    retq
325;
326; AVX-LABEL: shuffle_v4f32_6723:
327; AVX:       # %bb.0:
328; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
329; AVX-NEXT:    retq
330  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
331  ret <4 x float> %shuffle
332}
333
; Two-input v4i32 shuffle with mask <0,1,2,4>: lanes 0-2 of %a followed by
; lane 0 of %b. Expected lowerings per run configuration:
;   - SSE2, SSE3 and SSSE3 use a pair of shufps;
;   - SSE4.1 splats %b with pshufd and merges with pblendw;
;   - AVX1 uses vpermilps + vblendps, AVX2 uses vbroadcastss + vblendps;
;   - AVX512VL loads the index vector [0,1,2,4] and uses vpermt2d.
334define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
335; SSE2-LABEL: shuffle_v4i32_0124:
336; SSE2:       # %bb.0:
337; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
338; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
339; SSE2-NEXT:    retq
340;
341; SSE3-LABEL: shuffle_v4i32_0124:
342; SSE3:       # %bb.0:
343; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
344; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
345; SSE3-NEXT:    retq
346;
347; SSSE3-LABEL: shuffle_v4i32_0124:
348; SSSE3:       # %bb.0:
349; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
350; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
351; SSSE3-NEXT:    retq
352;
353; SSE41-LABEL: shuffle_v4i32_0124:
354; SSE41:       # %bb.0:
355; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
356; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
357; SSE41-NEXT:    retq
358;
359; AVX1-LABEL: shuffle_v4i32_0124:
360; AVX1:       # %bb.0:
361; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
362; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
363; AVX1-NEXT:    retq
364;
365; AVX2-LABEL: shuffle_v4i32_0124:
366; AVX2:       # %bb.0:
367; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
368; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
369; AVX2-NEXT:    retq
370;
371; AVX512VL-LABEL: shuffle_v4i32_0124:
372; AVX512VL:       # %bb.0:
373; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,4]
374; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
375; AVX512VL-NEXT:    retq
376  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
377  ret <4 x i32> %shuffle
378}
; Two-input v4i32 shuffle with mask <0,1,4,2>: lanes 0-1 of %a, lane 0 of %b,
; then lane 2 of %a. Expected lowerings per run configuration:
;   - SSE2, SSE3 and SSSE3 use a pair of shufps;
;   - SSE4.1 uses two pshufd followed by pblendw;
;   - AVX1 uses two vpermilps + vblendps; AVX2 replaces the first vpermilps
;     with vbroadcastss;
;   - AVX512VL loads the index vector [0,1,4,2] and uses vpermt2d.
379define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
380; SSE2-LABEL: shuffle_v4i32_0142:
381; SSE2:       # %bb.0:
382; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
383; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
384; SSE2-NEXT:    retq
385;
386; SSE3-LABEL: shuffle_v4i32_0142:
387; SSE3:       # %bb.0:
388; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
389; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
390; SSE3-NEXT:    retq
391;
392; SSSE3-LABEL: shuffle_v4i32_0142:
393; SSSE3:       # %bb.0:
394; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
395; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
396; SSSE3-NEXT:    retq
397;
398; SSE41-LABEL: shuffle_v4i32_0142:
399; SSE41:       # %bb.0:
400; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
401; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
402; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
403; SSE41-NEXT:    retq
404;
405; AVX1-LABEL: shuffle_v4i32_0142:
406; AVX1:       # %bb.0:
407; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
408; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
409; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
410; AVX1-NEXT:    retq
411;
412; AVX2-LABEL: shuffle_v4i32_0142:
413; AVX2:       # %bb.0:
414; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
415; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
416; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
417; AVX2-NEXT:    retq
418;
419; AVX512VL-LABEL: shuffle_v4i32_0142:
420; AVX512VL:       # %bb.0:
421; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,2]
422; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
423; AVX512VL-NEXT:    retq
424  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
425  ret <4 x i32> %shuffle
426}
; Two-input v4i32 shuffle with mask <0,4,1,2>: lane 0 of %a, lane 0 of %b,
; then lanes 1-2 of %a. Expected lowerings per run configuration:
;   - SSE2, SSE3 and SSSE3 build the result in xmm1 (movlhps + shufps) and
;     copy it back with movaps;
;   - SSE4.1 uses two pshufd followed by pblendw;
;   - AVX1 uses two vpermilps + vblendps; AVX2 replaces the first vpermilps
;     with vbroadcastss;
;   - AVX512VL loads the index vector [0,4,1,2] and uses vpermt2d.
427define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
428; SSE2-LABEL: shuffle_v4i32_0412:
429; SSE2:       # %bb.0:
430; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
431; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
432; SSE2-NEXT:    movaps %xmm1, %xmm0
433; SSE2-NEXT:    retq
434;
435; SSE3-LABEL: shuffle_v4i32_0412:
436; SSE3:       # %bb.0:
437; SSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
438; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
439; SSE3-NEXT:    movaps %xmm1, %xmm0
440; SSE3-NEXT:    retq
441;
442; SSSE3-LABEL: shuffle_v4i32_0412:
443; SSSE3:       # %bb.0:
444; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
445; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
446; SSSE3-NEXT:    movaps %xmm1, %xmm0
447; SSSE3-NEXT:    retq
448;
449; SSE41-LABEL: shuffle_v4i32_0412:
450; SSE41:       # %bb.0:
451; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
452; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
453; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
454; SSE41-NEXT:    retq
455;
456; AVX1-LABEL: shuffle_v4i32_0412:
457; AVX1:       # %bb.0:
458; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
459; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
460; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
461; AVX1-NEXT:    retq
462;
463; AVX2-LABEL: shuffle_v4i32_0412:
464; AVX2:       # %bb.0:
465; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
466; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
467; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
468; AVX2-NEXT:    retq
469;
470; AVX512VL-LABEL: shuffle_v4i32_0412:
471; AVX512VL:       # %bb.0:
472; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,4,1,2]
473; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
474; AVX512VL-NEXT:    retq
475  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
476  ret <4 x i32> %shuffle
477}
; Two-input v4i32 shuffle with mask <4,0,1,2>: lane 0 of %b followed by lanes
; 0-2 of %a. Expected lowerings per run configuration:
;   - SSE2, SSE3 and SSSE3 build the result in xmm1 (movlhps + shufps) and
;     copy it back with movaps;
;   - SSE4.1 uses pshufd + pblendw;
;   - the AVX1 and AVX2 runs share one expectation, vpermilps + vblendps;
;   - AVX512VL loads the index vector [4,0,1,2] and uses vpermt2d.
478define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
479; SSE2-LABEL: shuffle_v4i32_4012:
480; SSE2:       # %bb.0:
481; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
482; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
483; SSE2-NEXT:    movaps %xmm1, %xmm0
484; SSE2-NEXT:    retq
485;
486; SSE3-LABEL: shuffle_v4i32_4012:
487; SSE3:       # %bb.0:
488; SSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
489; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
490; SSE3-NEXT:    movaps %xmm1, %xmm0
491; SSE3-NEXT:    retq
492;
493; SSSE3-LABEL: shuffle_v4i32_4012:
494; SSSE3:       # %bb.0:
495; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
496; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
497; SSSE3-NEXT:    movaps %xmm1, %xmm0
498; SSSE3-NEXT:    retq
499;
500; SSE41-LABEL: shuffle_v4i32_4012:
501; SSE41:       # %bb.0:
502; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
503; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
504; SSE41-NEXT:    retq
505;
506; AVX1OR2-LABEL: shuffle_v4i32_4012:
507; AVX1OR2:       # %bb.0:
508; AVX1OR2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
509; AVX1OR2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
510; AVX1OR2-NEXT:    retq
511;
512; AVX512VL-LABEL: shuffle_v4i32_4012:
513; AVX512VL:       # %bb.0:
514; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,1,2]
515; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
516; AVX512VL-NEXT:    retq
517  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
518  ret <4 x i32> %shuffle
519}
; Two-input v4i32 shuffle with mask <0,1,4,5>: the low two lanes of %a followed
; by the low two lanes of %b. Although the element type is integer, the checks
; expect the same movlhps (SSE runs) / vmovlhps (AVX runs) as the float variant.
520define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
521; SSE-LABEL: shuffle_v4i32_0145:
522; SSE:       # %bb.0:
523; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
524; SSE-NEXT:    retq
525;
526; AVX-LABEL: shuffle_v4i32_0145:
527; AVX:       # %bb.0:
528; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
529; AVX-NEXT:    retq
530  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
531  ret <4 x i32> %shuffle
532}
; Two-input v4i32 shuffle with mask <0,4,5,1>: lane 0 of %a, lanes 0-1 of %b,
; then lane 1 of %a. Expected lowerings per run configuration:
;   - SSE2, SSE3 and SSSE3 interleave with punpckldq and fix up with pshufd;
;   - SSE4.1 uses two pshufd followed by pblendw;
;   - AVX1 uses two vpermilps + vblendps; AVX2 uses vpermilps + vmovddup +
;     vblendps;
;   - AVX512VL loads the index vector [0,4,5,1] and uses vpermt2d.
533define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
534; SSE2-LABEL: shuffle_v4i32_0451:
535; SSE2:       # %bb.0:
536; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
537; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
538; SSE2-NEXT:    retq
539;
540; SSE3-LABEL: shuffle_v4i32_0451:
541; SSE3:       # %bb.0:
542; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
543; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
544; SSE3-NEXT:    retq
545;
546; SSSE3-LABEL: shuffle_v4i32_0451:
547; SSSE3:       # %bb.0:
548; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
549; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
550; SSSE3-NEXT:    retq
551;
552; SSE41-LABEL: shuffle_v4i32_0451:
553; SSE41:       # %bb.0:
554; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
555; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
556; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
557; SSE41-NEXT:    retq
558;
559; AVX1-LABEL: shuffle_v4i32_0451:
560; AVX1:       # %bb.0:
561; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
562; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
563; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
564; AVX1-NEXT:    retq
565;
566; AVX2-LABEL: shuffle_v4i32_0451:
567; AVX2:       # %bb.0:
568; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
569; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
570; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
571; AVX2-NEXT:    retq
572;
573; AVX512VL-LABEL: shuffle_v4i32_0451:
574; AVX512VL:       # %bb.0:
575; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,4,5,1]
576; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
577; AVX512VL-NEXT:    retq
578  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
579  ret <4 x i32> %shuffle
580}
; Two-input v4i32 shuffle with mask <4,5,0,1>: the low two lanes of %b followed
; by the low two lanes of %a. The SSE runs expect movlhps into xmm1 plus a
; movaps copy back to xmm0; the AVX runs expect a single three-operand
; vmovlhps with no extra copy.
581define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
582; SSE-LABEL: shuffle_v4i32_4501:
583; SSE:       # %bb.0:
584; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
585; SSE-NEXT:    movaps %xmm1, %xmm0
586; SSE-NEXT:    retq
587;
588; AVX-LABEL: shuffle_v4i32_4501:
589; AVX:       # %bb.0:
590; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
591; AVX-NEXT:    retq
592  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
593  ret <4 x i32> %shuffle
594}
; Two-input v4i32 shuffle with mask <4,0,1,5>: lane 0 of %b, lanes 0-1 of %a,
; then lane 1 of %b. Expected lowerings per run configuration:
;   - SSE2, SSE3 and SSSE3 interleave with punpckldq and fix up with pshufd;
;   - SSE4.1 uses two pshufd followed by pblendw;
;   - AVX1 uses two vpermilps + vblendps; AVX2 uses vmovddup + vpermilps +
;     vblendps;
;   - AVX512VL loads the index vector [4,0,1,5] and uses vpermt2d.
595define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
596; SSE2-LABEL: shuffle_v4i32_4015:
597; SSE2:       # %bb.0:
598; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
599; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
600; SSE2-NEXT:    retq
601;
602; SSE3-LABEL: shuffle_v4i32_4015:
603; SSE3:       # %bb.0:
604; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
605; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
606; SSE3-NEXT:    retq
607;
608; SSSE3-LABEL: shuffle_v4i32_4015:
609; SSSE3:       # %bb.0:
610; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
611; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
612; SSSE3-NEXT:    retq
613;
614; SSE41-LABEL: shuffle_v4i32_4015:
615; SSE41:       # %bb.0:
616; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
617; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
618; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
619; SSE41-NEXT:    retq
620;
621; AVX1-LABEL: shuffle_v4i32_4015:
622; AVX1:       # %bb.0:
623; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
624; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
625; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
626; AVX1-NEXT:    retq
627;
628; AVX2-LABEL: shuffle_v4i32_4015:
629; AVX2:       # %bb.0:
630; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
631; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
632; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
633; AVX2-NEXT:    retq
634;
635; AVX512VL-LABEL: shuffle_v4i32_4015:
636; AVX512VL:       # %bb.0:
637; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,0,1,5]
638; AVX512VL-NEXT:    vpermt2d %xmm1, %xmm2, %xmm0
639; AVX512VL-NEXT:    retq
640  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
641  ret <4 x i32> %shuffle
642}
643
; Shuffle of a zero vector (first operand) with %a (second operand) using mask
; <4,1,2,3>: lane 0 takes %a[0] and lanes 1-3 take zeros. Expected lowerings:
;   - SSE2, SSE3 and SSSE3 zero xmm1, insert the scalar with movss and copy
;     the result back with movaps;
;   - SSE4.1 uses xorps + blendps; the AVX runs use vxorps + vblendps.
644define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
645; SSE2-LABEL: shuffle_v4f32_4zzz:
646; SSE2:       # %bb.0:
647; SSE2-NEXT:    xorps %xmm1, %xmm1
648; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
649; SSE2-NEXT:    movaps %xmm1, %xmm0
650; SSE2-NEXT:    retq
651;
652; SSE3-LABEL: shuffle_v4f32_4zzz:
653; SSE3:       # %bb.0:
654; SSE3-NEXT:    xorps %xmm1, %xmm1
655; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
656; SSE3-NEXT:    movaps %xmm1, %xmm0
657; SSE3-NEXT:    retq
658;
659; SSSE3-LABEL: shuffle_v4f32_4zzz:
660; SSSE3:       # %bb.0:
661; SSSE3-NEXT:    xorps %xmm1, %xmm1
662; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
663; SSSE3-NEXT:    movaps %xmm1, %xmm0
664; SSSE3-NEXT:    retq
665;
666; SSE41-LABEL: shuffle_v4f32_4zzz:
667; SSE41:       # %bb.0:
668; SSE41-NEXT:    xorps %xmm1, %xmm1
669; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
670; SSE41-NEXT:    retq
671;
672; AVX-LABEL: shuffle_v4f32_4zzz:
673; AVX:       # %bb.0:
674; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
675; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
676; AVX-NEXT:    retq
677  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
678  ret <4 x float> %shuffle
679}
680
; Shuffle of a zero vector (first operand) with %a (second operand) using mask
; <2,4,3,0>: lane 1 takes %a[0]; lanes 0, 2 and 3 index the zero vector, so
; they are zero whichever of its lanes they name. Expected lowerings:
;   - SSE2, SSE3 and SSSE3 use movq + xorps + shufps;
;   - SSE4.1 uses a single insertps, and the AVX runs a single vinsertps, with
;     the zeroed lanes folded into the instruction.
681define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
682; SSE2-LABEL: shuffle_v4f32_z4zz:
683; SSE2:       # %bb.0:
684; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
685; SSE2-NEXT:    xorps %xmm1, %xmm1
686; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
687; SSE2-NEXT:    retq
688;
689; SSE3-LABEL: shuffle_v4f32_z4zz:
690; SSE3:       # %bb.0:
691; SSE3-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
692; SSE3-NEXT:    xorps %xmm1, %xmm1
693; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
694; SSE3-NEXT:    retq
695;
696; SSSE3-LABEL: shuffle_v4f32_z4zz:
697; SSSE3:       # %bb.0:
698; SSSE3-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
699; SSSE3-NEXT:    xorps %xmm1, %xmm1
700; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
701; SSSE3-NEXT:    retq
702;
703; SSE41-LABEL: shuffle_v4f32_z4zz:
704; SSE41:       # %bb.0:
705; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
706; SSE41-NEXT:    retq
707;
708; AVX-LABEL: shuffle_v4f32_z4zz:
709; AVX:       # %bb.0:
710; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
711; AVX-NEXT:    retq
712  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
713  ret <4 x float> %shuffle
714}
715
; Shuffle of a zero vector (first operand) with %a (second operand) using mask
; <0,0,4,0>: lane 2 takes %a[0] and the remaining lanes take zeros. Expected
; lowerings:
;   - SSE2, SSE3 and SSSE3 use movq + pxor + shufps;
;   - SSE4.1 uses a single insertps, and the AVX runs a single vinsertps.
716define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
717; SSE2-LABEL: shuffle_v4f32_zz4z:
718; SSE2:       # %bb.0:
719; SSE2-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
720; SSE2-NEXT:    pxor %xmm0, %xmm0
721; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
722; SSE2-NEXT:    retq
723;
724; SSE3-LABEL: shuffle_v4f32_zz4z:
725; SSE3:       # %bb.0:
726; SSE3-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
727; SSE3-NEXT:    pxor %xmm0, %xmm0
728; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
729; SSE3-NEXT:    retq
730;
731; SSSE3-LABEL: shuffle_v4f32_zz4z:
732; SSSE3:       # %bb.0:
733; SSSE3-NEXT:    movq {{.*#+}} xmm1 = xmm0[0],zero
734; SSSE3-NEXT:    pxor %xmm0, %xmm0
735; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
736; SSSE3-NEXT:    retq
737;
738; SSE41-LABEL: shuffle_v4f32_zz4z:
739; SSE41:       # %bb.0:
740; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
741; SSE41-NEXT:    retq
742;
743; AVX-LABEL: shuffle_v4f32_zz4z:
744; AVX:       # %bb.0:
745; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
746; AVX-NEXT:    retq
747  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
748  ret <4 x float> %shuffle
749}
750
; Shuffle of a zero vector (first operand) with %a (second operand) using mask
; <0,undef,undef,4>: lane 0 is zero, lane 3 takes %a[0], and lanes 1-2 are
; left undefined by the mask. Expected lowerings:
;   - SSE2, SSE3 and SSSE3 use xorps + one shufps, then movaps to copy back;
;   - SSE4.1 uses a single insertps, and the AVX runs a single vinsertps; the
;     expected output shows the undefined lanes as zero there.
751define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
752; SSE2-LABEL: shuffle_v4f32_zuu4:
753; SSE2:       # %bb.0:
754; SSE2-NEXT:    xorps %xmm1, %xmm1
755; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
756; SSE2-NEXT:    movaps %xmm1, %xmm0
757; SSE2-NEXT:    retq
758;
759; SSE3-LABEL: shuffle_v4f32_zuu4:
760; SSE3:       # %bb.0:
761; SSE3-NEXT:    xorps %xmm1, %xmm1
762; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
763; SSE3-NEXT:    movaps %xmm1, %xmm0
764; SSE3-NEXT:    retq
765;
766; SSSE3-LABEL: shuffle_v4f32_zuu4:
767; SSSE3:       # %bb.0:
768; SSSE3-NEXT:    xorps %xmm1, %xmm1
769; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
770; SSSE3-NEXT:    movaps %xmm1, %xmm0
771; SSSE3-NEXT:    retq
772;
773; SSE41-LABEL: shuffle_v4f32_zuu4:
774; SSE41:       # %bb.0:
775; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
776; SSE41-NEXT:    retq
777;
778; AVX-LABEL: shuffle_v4f32_zuu4:
779; AVX:       # %bb.0:
780; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
781; AVX-NEXT:    retq
782  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
783  ret <4 x float> %shuffle
784}
785
; Shuffle of a zero vector (first operand) with %a (second operand) using mask
; <0,1,2,7>: lanes 0-2 are zero and lane 3 keeps %a[3] in place. Expected
; lowerings:
;   - SSE2, SSE3 and SSSE3 use xorps + two shufps, then movaps to copy back;
;   - SSE4.1 uses xorps + blendps; the AVX runs use vxorps + vblendps.
786define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
787; SSE2-LABEL: shuffle_v4f32_zzz7:
788; SSE2:       # %bb.0:
789; SSE2-NEXT:    xorps %xmm1, %xmm1
790; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
791; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
792; SSE2-NEXT:    movaps %xmm1, %xmm0
793; SSE2-NEXT:    retq
794;
795; SSE3-LABEL: shuffle_v4f32_zzz7:
796; SSE3:       # %bb.0:
797; SSE3-NEXT:    xorps %xmm1, %xmm1
798; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
799; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
800; SSE3-NEXT:    movaps %xmm1, %xmm0
801; SSE3-NEXT:    retq
802;
803; SSSE3-LABEL: shuffle_v4f32_zzz7:
804; SSSE3:       # %bb.0:
805; SSSE3-NEXT:    xorps %xmm1, %xmm1
806; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
807; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
808; SSSE3-NEXT:    movaps %xmm1, %xmm0
809; SSSE3-NEXT:    retq
810;
811; SSE41-LABEL: shuffle_v4f32_zzz7:
812; SSE41:       # %bb.0:
813; SSE41-NEXT:    xorps %xmm1, %xmm1
814; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
815; SSE41-NEXT:    retq
816;
817; AVX-LABEL: shuffle_v4f32_zzz7:
818; AVX:       # %bb.0:
819; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
820; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
821; AVX-NEXT:    retq
822  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
823  ret <4 x float> %shuffle
824}
825
; Mask <0,6,2,3> with zeroinitializer as the first operand, so the result is
; <0.0, %a[2], 0.0, 0.0>.
; The SSE2, SSE3 and SSSE3 runs expect xorps, unpckhpd and shufps; the SSE4.1
; and AVX runs expect a single (v)insertps that also zeroes the other lanes.
826define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
827; SSE2-LABEL: shuffle_v4f32_z6zz:
828; SSE2:       # %bb.0:
829; SSE2-NEXT:    xorps %xmm1, %xmm1
830; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
831; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
832; SSE2-NEXT:    retq
833;
834; SSE3-LABEL: shuffle_v4f32_z6zz:
835; SSE3:       # %bb.0:
836; SSE3-NEXT:    xorps %xmm1, %xmm1
837; SSE3-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
838; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
839; SSE3-NEXT:    retq
840;
841; SSSE3-LABEL: shuffle_v4f32_z6zz:
842; SSSE3:       # %bb.0:
843; SSSE3-NEXT:    xorps %xmm1, %xmm1
844; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
845; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
846; SSSE3-NEXT:    retq
847;
848; SSE41-LABEL: shuffle_v4f32_z6zz:
849; SSE41:       # %bb.0:
850; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
851; SSE41-NEXT:    retq
852;
853; AVX-LABEL: shuffle_v4f32_z6zz:
854; AVX:       # %bb.0:
855; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
856; AVX-NEXT:    retq
857  %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
858  ret <4 x float> %shuffle
859}
860
; Mask <0,4,2,3> with zeroinitializer as the second operand, so the result is
; %a with lane 1 replaced by zero, i.e. <%a[0], 0.0, %a[2], %a[3]>.
; The SSE2, SSE3 and SSSE3 runs expect xorps, movlhps, shufps and a register
; copy; the SSE4.1 and AVX runs expect xorps followed by a single (v)blendps.
861define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
862; SSE2-LABEL: shuffle_v4f32_0z23:
863; SSE2:       # %bb.0:
864; SSE2-NEXT:    xorps %xmm1, %xmm1
865; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
866; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
867; SSE2-NEXT:    movaps %xmm1, %xmm0
868; SSE2-NEXT:    retq
869;
870; SSE3-LABEL: shuffle_v4f32_0z23:
871; SSE3:       # %bb.0:
872; SSE3-NEXT:    xorps %xmm1, %xmm1
873; SSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
874; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
875; SSE3-NEXT:    movaps %xmm1, %xmm0
876; SSE3-NEXT:    retq
877;
878; SSSE3-LABEL: shuffle_v4f32_0z23:
879; SSSE3:       # %bb.0:
880; SSSE3-NEXT:    xorps %xmm1, %xmm1
881; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
882; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
883; SSSE3-NEXT:    movaps %xmm1, %xmm0
884; SSSE3-NEXT:    retq
885;
886; SSE41-LABEL: shuffle_v4f32_0z23:
887; SSE41:       # %bb.0:
888; SSE41-NEXT:    xorps %xmm1, %xmm1
889; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
890; SSE41-NEXT:    retq
891;
892; AVX-LABEL: shuffle_v4f32_0z23:
893; AVX:       # %bb.0:
894; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
895; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
896; AVX-NEXT:    retq
897  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
898  ret <4 x float> %shuffle
899}
900
; Mask <0,1,4,3> with zeroinitializer as the second operand, so the result is
; %a with lane 2 replaced by zero, i.e. <%a[0], %a[1], 0.0, %a[3]>.
; The SSE2, SSE3 and SSSE3 runs expect xorps and two shufps; the SSE4.1 and AVX
; runs expect xorps followed by a single (v)blendps.
901define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) {
902; SSE2-LABEL: shuffle_v4f32_01z3:
903; SSE2:       # %bb.0:
904; SSE2-NEXT:    xorps %xmm1, %xmm1
905; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
906; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
907; SSE2-NEXT:    retq
908;
909; SSE3-LABEL: shuffle_v4f32_01z3:
910; SSE3:       # %bb.0:
911; SSE3-NEXT:    xorps %xmm1, %xmm1
912; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
913; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
914; SSE3-NEXT:    retq
915;
916; SSSE3-LABEL: shuffle_v4f32_01z3:
917; SSSE3:       # %bb.0:
918; SSSE3-NEXT:    xorps %xmm1, %xmm1
919; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
920; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
921; SSSE3-NEXT:    retq
922;
923; SSE41-LABEL: shuffle_v4f32_01z3:
924; SSE41:       # %bb.0:
925; SSE41-NEXT:    xorps %xmm1, %xmm1
926; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
927; SSE41-NEXT:    retq
928;
929; AVX-LABEL: shuffle_v4f32_01z3:
930; AVX:       # %bb.0:
931; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
932; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
933; AVX-NEXT:    retq
934  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
935  ret <4 x float> %shuffle
936}
937
; Mask <0,1,2,7> with zeroinitializer as the second operand, so the result is
; %a with lane 3 replaced by zero, i.e. <%a[0], %a[1], %a[2], 0.0>.
; The SSE2, SSE3 and SSSE3 runs expect xorps and two shufps; the SSE4.1 and AVX
; runs expect xorps followed by a single (v)blendps.
938define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) {
939; SSE2-LABEL: shuffle_v4f32_012z:
940; SSE2:       # %bb.0:
941; SSE2-NEXT:    xorps %xmm1, %xmm1
942; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
943; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
944; SSE2-NEXT:    retq
945;
946; SSE3-LABEL: shuffle_v4f32_012z:
947; SSE3:       # %bb.0:
948; SSE3-NEXT:    xorps %xmm1, %xmm1
949; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
950; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
951; SSE3-NEXT:    retq
952;
953; SSSE3-LABEL: shuffle_v4f32_012z:
954; SSSE3:       # %bb.0:
955; SSSE3-NEXT:    xorps %xmm1, %xmm1
956; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
957; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
958; SSSE3-NEXT:    retq
959;
960; SSE41-LABEL: shuffle_v4f32_012z:
961; SSE41:       # %bb.0:
962; SSE41-NEXT:    xorps %xmm1, %xmm1
963; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
964; SSE41-NEXT:    retq
965;
966; AVX-LABEL: shuffle_v4f32_012z:
967; AVX:       # %bb.0:
968; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
969; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
970; AVX-NEXT:    retq
971  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
972  ret <4 x float> %shuffle
973}
974
; Mask <0,4,4,3> with zeroinitializer as the second operand (index 4 is used
; twice), so the result is <%a[0], 0.0, 0.0, %a[3]>.
; The SSE2, SSE3 and SSSE3 runs expect xorps and two shufps; the SSE4.1 and AVX
; runs expect xorps followed by a single (v)blendps.
975define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
976; SSE2-LABEL: shuffle_v4f32_0zz3:
977; SSE2:       # %bb.0:
978; SSE2-NEXT:    xorps %xmm1, %xmm1
979; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
980; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
981; SSE2-NEXT:    retq
982;
983; SSE3-LABEL: shuffle_v4f32_0zz3:
984; SSE3:       # %bb.0:
985; SSE3-NEXT:    xorps %xmm1, %xmm1
986; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
987; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
988; SSE3-NEXT:    retq
989;
990; SSSE3-LABEL: shuffle_v4f32_0zz3:
991; SSSE3:       # %bb.0:
992; SSSE3-NEXT:    xorps %xmm1, %xmm1
993; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
994; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
995; SSSE3-NEXT:    retq
996;
997; SSE41-LABEL: shuffle_v4f32_0zz3:
998; SSE41:       # %bb.0:
999; SSE41-NEXT:    xorps %xmm1, %xmm1
1000; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1001; SSE41-NEXT:    retq
1002;
1003; AVX-LABEL: shuffle_v4f32_0zz3:
1004; AVX:       # %bb.0:
1005; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1006; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1007; AVX-NEXT:    retq
1008  %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1009  ret <4 x float> %shuffle
1010}
1011
; Mask <0,4,2,4> where the second operand is <0.0, undef, undef, undef>; only
; its element 0 (mask index 4) is referenced, so the result is
; <%v[0], 0.0, %v[2], 0.0>.
; The SSE2, SSE3 and SSSE3 runs expect xorps and two shufps; the SSE4.1 and AVX
; runs expect xorps followed by a single (v)blendps.
1012define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
1013; SSE2-LABEL: shuffle_v4f32_0z2z:
1014; SSE2:       # %bb.0:
1015; SSE2-NEXT:    xorps %xmm1, %xmm1
1016; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1017; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1018; SSE2-NEXT:    retq
1019;
1020; SSE3-LABEL: shuffle_v4f32_0z2z:
1021; SSE3:       # %bb.0:
1022; SSE3-NEXT:    xorps %xmm1, %xmm1
1023; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1024; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1025; SSE3-NEXT:    retq
1026;
1027; SSSE3-LABEL: shuffle_v4f32_0z2z:
1028; SSSE3:       # %bb.0:
1029; SSSE3-NEXT:    xorps %xmm1, %xmm1
1030; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
1031; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
1032; SSSE3-NEXT:    retq
1033;
1034; SSE41-LABEL: shuffle_v4f32_0z2z:
1035; SSE41:       # %bb.0:
1036; SSE41-NEXT:    xorps %xmm1, %xmm1
1037; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1038; SSE41-NEXT:    retq
1039;
1040; AVX-LABEL: shuffle_v4f32_0z2z:
1041; AVX:       # %bb.0:
1042; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1043; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
1044; AVX-NEXT:    retq
1045  %shuffle = shufflevector <4 x float> %v, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
1046  ret <4 x float> %shuffle
1047}
1048
; Mask <undef,0,5,1> over %a and %b, so the result is
; <undef, %a[0], %b[1], %a[1]>.
; Every run expects a single (v)unpcklps of %b with %a, which produces
; <%b[0], %a[0], %b[1], %a[1]>; the SSE runs add a register copy.
1049define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
1050; SSE-LABEL: shuffle_v4f32_u051:
1051; SSE:       # %bb.0:
1052; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1053; SSE-NEXT:    movaps %xmm1, %xmm0
1054; SSE-NEXT:    retq
1055;
1056; AVX-LABEL: shuffle_v4f32_u051:
1057; AVX:       # %bb.0:
1058; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1059; AVX-NEXT:    retq
1060  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1>
1061  ret <4 x float> %shuffle
1062}
1063
; Two chained shuffles. The first builds <undef, 0.0, 0.0, %b[0]> from %b and
; zeroinitializer; the second takes lane 0 from %a and lanes 1-3 from that
; intermediate, giving <%a[0], 0.0, 0.0, %b[0]>.
; The SSE2, SSE3 and SSSE3 runs expect movq, pxor, shufps, movss and a register
; copy; the SSE4.1 and AVX runs expect a single (v)insertps.
1064define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
1065; SSE2-LABEL: shuffle_v4f32_0zz4:
1066; SSE2:       # %bb.0:
1067; SSE2-NEXT:    movq {{.*#+}} xmm2 = xmm1[0],zero
1068; SSE2-NEXT:    pxor %xmm1, %xmm1
1069; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1070; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1071; SSE2-NEXT:    movaps %xmm1, %xmm0
1072; SSE2-NEXT:    retq
1073;
1074; SSE3-LABEL: shuffle_v4f32_0zz4:
1075; SSE3:       # %bb.0:
1076; SSE3-NEXT:    movq {{.*#+}} xmm2 = xmm1[0],zero
1077; SSE3-NEXT:    pxor %xmm1, %xmm1
1078; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1079; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1080; SSE3-NEXT:    movaps %xmm1, %xmm0
1081; SSE3-NEXT:    retq
1082;
1083; SSSE3-LABEL: shuffle_v4f32_0zz4:
1084; SSSE3:       # %bb.0:
1085; SSSE3-NEXT:    movq {{.*#+}} xmm2 = xmm1[0],zero
1086; SSSE3-NEXT:    pxor %xmm1, %xmm1
1087; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1088; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1089; SSSE3-NEXT:    movaps %xmm1, %xmm0
1090; SSSE3-NEXT:    retq
1091;
1092; SSE41-LABEL: shuffle_v4f32_0zz4:
1093; SSE41:       # %bb.0:
1094; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1095; SSE41-NEXT:    retq
1096;
1097; AVX-LABEL: shuffle_v4f32_0zz4:
1098; AVX:       # %bb.0:
1099; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
1100; AVX-NEXT:    retq
1101  %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0>
1102  %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1103  ret <4 x float> %shuffle1
1104}
1105
; Two chained shuffles. The first builds <%a[0], undef, undef, %b[2]>; the
; second keeps its lanes 0 and 3 and fills lanes 1 and 2 from zeroinitializer,
; giving <%a[0], 0.0, 0.0, %b[2]>.
; The SSE2, SSE3 and SSSE3 runs expect three shufps around an xorps plus a
; register copy; the SSE4.1 and AVX runs expect a single (v)insertps.
1106define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
1107; SSE2-LABEL: shuffle_v4f32_0zz6:
1108; SSE2:       # %bb.0:
1109; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1110; SSE2-NEXT:    xorps %xmm1, %xmm1
1111; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1112; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1113; SSE2-NEXT:    movaps %xmm1, %xmm0
1114; SSE2-NEXT:    retq
1115;
1116; SSE3-LABEL: shuffle_v4f32_0zz6:
1117; SSE3:       # %bb.0:
1118; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1119; SSE3-NEXT:    xorps %xmm1, %xmm1
1120; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1121; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1122; SSE3-NEXT:    movaps %xmm1, %xmm0
1123; SSE3-NEXT:    retq
1124;
1125; SSSE3-LABEL: shuffle_v4f32_0zz6:
1126; SSSE3:       # %bb.0:
1127; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
1128; SSSE3-NEXT:    xorps %xmm1, %xmm1
1129; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
1130; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1131; SSSE3-NEXT:    movaps %xmm1, %xmm0
1132; SSSE3-NEXT:    retq
1133;
1134; SSE41-LABEL: shuffle_v4f32_0zz6:
1135; SSE41:       # %bb.0:
1136; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1137; SSE41-NEXT:    retq
1138;
1139; AVX-LABEL: shuffle_v4f32_0zz6:
1140; AVX:       # %bb.0:
1141; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
1142; AVX-NEXT:    retq
1143  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
1144  %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
1145  ret <4 x float> %shuffle1
1146}
1147
; Two chained shuffles. The first builds <%a[0], undef, %a[2], %b[0]>; the
; second keeps its lanes 0, 2 and 3 and fills lane 1 from zeroinitializer,
; giving <%a[0], 0.0, %a[2], %b[0]>.
; The SSE2, SSE3 and SSSE3 runs expect shufps, xorps, movlhps, shufps and a
; register copy; the SSE4.1 and AVX runs expect a single (v)insertps.
1148define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
1149; SSE2-LABEL: shuffle_v4f32_0z24:
1150; SSE2:       # %bb.0:
1151; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1152; SSE2-NEXT:    xorps %xmm2, %xmm2
1153; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1154; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1155; SSE2-NEXT:    movaps %xmm2, %xmm0
1156; SSE2-NEXT:    retq
1157;
1158; SSE3-LABEL: shuffle_v4f32_0z24:
1159; SSE3:       # %bb.0:
1160; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1161; SSE3-NEXT:    xorps %xmm2, %xmm2
1162; SSE3-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1163; SSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1164; SSE3-NEXT:    movaps %xmm2, %xmm0
1165; SSE3-NEXT:    retq
1166;
1167; SSSE3-LABEL: shuffle_v4f32_0z24:
1168; SSSE3:       # %bb.0:
1169; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
1170; SSSE3-NEXT:    xorps %xmm2, %xmm2
1171; SSSE3-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1172; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
1173; SSSE3-NEXT:    movaps %xmm2, %xmm0
1174; SSSE3-NEXT:    retq
1175;
1176; SSE41-LABEL: shuffle_v4f32_0z24:
1177; SSE41:       # %bb.0:
1178; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1179; SSE41-NEXT:    retq
1180;
1181; AVX-LABEL: shuffle_v4f32_0z24:
1182; AVX:       # %bb.0:
1183; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
1184; AVX-NEXT:    retq
1185  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
1186  %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1187  ret <4 x float> %shuffle1
1188}
1189
; Mask <4,1,2,3> with zeroinitializer as the first operand, so the result is
; <%a[0], 0, 0, 0>.
; The SSE2, SSE3 and SSSE3 runs expect xorps, movss and a register copy; the
; SSE4.1 and AVX runs expect xorps followed by a single (v)blendps.
1190define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
1191; SSE2-LABEL: shuffle_v4i32_4zzz:
1192; SSE2:       # %bb.0:
1193; SSE2-NEXT:    xorps %xmm1, %xmm1
1194; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1195; SSE2-NEXT:    movaps %xmm1, %xmm0
1196; SSE2-NEXT:    retq
1197;
1198; SSE3-LABEL: shuffle_v4i32_4zzz:
1199; SSE3:       # %bb.0:
1200; SSE3-NEXT:    xorps %xmm1, %xmm1
1201; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1202; SSE3-NEXT:    movaps %xmm1, %xmm0
1203; SSE3-NEXT:    retq
1204;
1205; SSSE3-LABEL: shuffle_v4i32_4zzz:
1206; SSSE3:       # %bb.0:
1207; SSSE3-NEXT:    xorps %xmm1, %xmm1
1208; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1209; SSSE3-NEXT:    movaps %xmm1, %xmm0
1210; SSSE3-NEXT:    retq
1211;
1212; SSE41-LABEL: shuffle_v4i32_4zzz:
1213; SSE41:       # %bb.0:
1214; SSE41-NEXT:    xorps %xmm1, %xmm1
1215; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1216; SSE41-NEXT:    retq
1217;
1218; AVX-LABEL: shuffle_v4i32_4zzz:
1219; AVX:       # %bb.0:
1220; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1221; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1222; AVX-NEXT:    retq
1223  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1224  ret <4 x i32> %shuffle
1225}
1226
; Mask <2,4,3,0> with zeroinitializer as the first operand; indices 2, 3 and 0
; all select zero elements, so the result is <0, %a[0], 0, 0>.
; The SSE, AVX1 and AVX2-SLOW runs expect the low element to be isolated
; against a zero register and then permuted with [1,0,1,1]; the AVX2-FAST and
; AVX512VL runs expect a single vpshufb.
1227define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
1228; SSE2-LABEL: shuffle_v4i32_z4zz:
1229; SSE2:       # %bb.0:
1230; SSE2-NEXT:    xorps %xmm1, %xmm1
1231; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1232; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1233; SSE2-NEXT:    retq
1234;
1235; SSE3-LABEL: shuffle_v4i32_z4zz:
1236; SSE3:       # %bb.0:
1237; SSE3-NEXT:    xorps %xmm1, %xmm1
1238; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1239; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1240; SSE3-NEXT:    retq
1241;
1242; SSSE3-LABEL: shuffle_v4i32_z4zz:
1243; SSSE3:       # %bb.0:
1244; SSSE3-NEXT:    xorps %xmm1, %xmm1
1245; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1246; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1247; SSSE3-NEXT:    retq
1248;
1249; SSE41-LABEL: shuffle_v4i32_z4zz:
1250; SSE41:       # %bb.0:
1251; SSE41-NEXT:    pxor %xmm1, %xmm1
1252; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1253; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
1254; SSE41-NEXT:    retq
1255;
1256; AVX1-LABEL: shuffle_v4i32_z4zz:
1257; AVX1:       # %bb.0:
1258; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1259; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1260; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1261; AVX1-NEXT:    retq
1262;
1263; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz:
1264; AVX2-SLOW:       # %bb.0:
1265; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1266; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1267; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
1268; AVX2-SLOW-NEXT:    retq
1269;
1270; AVX2-FAST-LABEL: shuffle_v4i32_z4zz:
1271; AVX2-FAST:       # %bb.0:
1272; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1273; AVX2-FAST-NEXT:    retq
1274;
1275; AVX512VL-LABEL: shuffle_v4i32_z4zz:
1276; AVX512VL:       # %bb.0:
1277; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
1278; AVX512VL-NEXT:    retq
1279  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
1280  ret <4 x i32> %shuffle
1281}
1282
; Mask <0,0,4,0> with zeroinitializer as the first operand, so the result is
; <0, 0, %a[0], 0>.
; The SSE, AVX1 and AVX2-SLOW runs expect the low element to be isolated
; against a zero register and then permuted with [1,1,0,1]; the AVX2-FAST and
; AVX512VL runs expect a single vpshufb.
1283define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
1284; SSE2-LABEL: shuffle_v4i32_zz4z:
1285; SSE2:       # %bb.0:
1286; SSE2-NEXT:    xorps %xmm1, %xmm1
1287; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1288; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1289; SSE2-NEXT:    retq
1290;
1291; SSE3-LABEL: shuffle_v4i32_zz4z:
1292; SSE3:       # %bb.0:
1293; SSE3-NEXT:    xorps %xmm1, %xmm1
1294; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1295; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1296; SSE3-NEXT:    retq
1297;
1298; SSSE3-LABEL: shuffle_v4i32_zz4z:
1299; SSSE3:       # %bb.0:
1300; SSSE3-NEXT:    xorps %xmm1, %xmm1
1301; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1302; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1303; SSSE3-NEXT:    retq
1304;
1305; SSE41-LABEL: shuffle_v4i32_zz4z:
1306; SSE41:       # %bb.0:
1307; SSE41-NEXT:    pxor %xmm1, %xmm1
1308; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1309; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
1310; SSE41-NEXT:    retq
1311;
1312; AVX1-LABEL: shuffle_v4i32_zz4z:
1313; AVX1:       # %bb.0:
1314; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1315; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1316; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1317; AVX1-NEXT:    retq
1318;
1319; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z:
1320; AVX2-SLOW:       # %bb.0:
1321; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1322; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1323; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
1324; AVX2-SLOW-NEXT:    retq
1325;
1326; AVX2-FAST-LABEL: shuffle_v4i32_zz4z:
1327; AVX2-FAST:       # %bb.0:
1328; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1329; AVX2-FAST-NEXT:    retq
1330;
1331; AVX512VL-LABEL: shuffle_v4i32_zz4z:
1332; AVX512VL:       # %bb.0:
1333; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
1334; AVX512VL-NEXT:    retq
1335  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
1336  ret <4 x i32> %shuffle
1337}
1338
; Mask <0,undef,undef,4> with zeroinitializer as the first operand, so the
; result is <0, undef, undef, %a[0]>.
; Every run expects a single (v)pslldq byte shift that moves the low dword of
; %a into lane 3 and zero-fills the lower twelve bytes.
1339define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
1340; SSE-LABEL: shuffle_v4i32_zuu4:
1341; SSE:       # %bb.0:
1342; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1343; SSE-NEXT:    retq
1344;
1345; AVX-LABEL: shuffle_v4i32_zuu4:
1346; AVX:       # %bb.0:
1347; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1348; AVX-NEXT:    retq
1349  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
1350  ret <4 x i32> %shuffle
1351}
1352
; Mask <0,6,2,3> with zeroinitializer as the first operand, so the result is
; <0, %a[2], 0, 0>.
; The SSE2, SSE3 and SSSE3 runs expect psrldq, xorps and shufps; the SSE4.1,
; AVX1 and AVX2-SLOW runs expect a permute plus a blend against zero; the
; AVX2-FAST and AVX512VL runs expect a single vpshufb.
1353define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
1354; SSE2-LABEL: shuffle_v4i32_z6zz:
1355; SSE2:       # %bb.0:
1356; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1357; SSE2-NEXT:    xorps %xmm1, %xmm1
1358; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1359; SSE2-NEXT:    retq
1360;
1361; SSE3-LABEL: shuffle_v4i32_z6zz:
1362; SSE3:       # %bb.0:
1363; SSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1364; SSE3-NEXT:    xorps %xmm1, %xmm1
1365; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1366; SSE3-NEXT:    retq
1367;
1368; SSSE3-LABEL: shuffle_v4i32_z6zz:
1369; SSSE3:       # %bb.0:
1370; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
1371; SSSE3-NEXT:    xorps %xmm1, %xmm1
1372; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1373; SSSE3-NEXT:    retq
1374;
1375; SSE41-LABEL: shuffle_v4i32_z6zz:
1376; SSE41:       # %bb.0:
1377; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
1378; SSE41-NEXT:    pxor %xmm0, %xmm0
1379; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1380; SSE41-NEXT:    retq
1381;
1382; AVX1-LABEL: shuffle_v4i32_z6zz:
1383; AVX1:       # %bb.0:
1384; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1385; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1386; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1387; AVX1-NEXT:    retq
1388;
1389; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz:
1390; AVX2-SLOW:       # %bb.0:
1391; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
1392; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1393; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1394; AVX2-SLOW-NEXT:    retq
1395;
1396; AVX2-FAST-LABEL: shuffle_v4i32_z6zz:
1397; AVX2-FAST:       # %bb.0:
1398; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1399; AVX2-FAST-NEXT:    retq
1400;
1401; AVX512VL-LABEL: shuffle_v4i32_z6zz:
1402; AVX512VL:       # %bb.0:
1403; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero
1404; AVX512VL-NEXT:    retq
1405  %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
1406  ret <4 x i32> %shuffle
1407}
1408
; Mask <7,0,1,2> over %a and %b, so the result is
; <%b[3], %a[0], %a[1], %a[2]>.
; The SSE2 and SSE3 runs expect two shufps and a register copy; the SSSE3,
; SSE4.1 and AVX runs expect a single (v)palignr.
1409define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
1410; SSE2-LABEL: shuffle_v4i32_7012:
1411; SSE2:       # %bb.0:
1412; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1413; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1414; SSE2-NEXT:    movaps %xmm1, %xmm0
1415; SSE2-NEXT:    retq
1416;
1417; SSE3-LABEL: shuffle_v4i32_7012:
1418; SSE3:       # %bb.0:
1419; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
1420; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
1421; SSE3-NEXT:    movaps %xmm1, %xmm0
1422; SSE3-NEXT:    retq
1423;
1424; SSSE3-LABEL: shuffle_v4i32_7012:
1425; SSSE3:       # %bb.0:
1426; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1427; SSSE3-NEXT:    retq
1428;
1429; SSE41-LABEL: shuffle_v4i32_7012:
1430; SSE41:       # %bb.0:
1431; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1432; SSE41-NEXT:    retq
1433;
1434; AVX-LABEL: shuffle_v4i32_7012:
1435; AVX:       # %bb.0:
1436; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1437; AVX-NEXT:    retq
1438  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
1439  ret <4 x i32> %shuffle
1440}
1441
; Mask <6,7,0,1> over %a and %b, so the result is
; <%b[2], %b[3], %a[0], %a[1]>.
; The SSE2 and SSE3 runs expect one shufps and a register copy; the SSSE3,
; SSE4.1 and AVX runs expect a single (v)palignr.
1442define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
1443; SSE2-LABEL: shuffle_v4i32_6701:
1444; SSE2:       # %bb.0:
1445; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1446; SSE2-NEXT:    movaps %xmm1, %xmm0
1447; SSE2-NEXT:    retq
1448;
1449; SSE3-LABEL: shuffle_v4i32_6701:
1450; SSE3:       # %bb.0:
1451; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
1452; SSE3-NEXT:    movaps %xmm1, %xmm0
1453; SSE3-NEXT:    retq
1454;
1455; SSSE3-LABEL: shuffle_v4i32_6701:
1456; SSSE3:       # %bb.0:
1457; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1458; SSSE3-NEXT:    retq
1459;
1460; SSE41-LABEL: shuffle_v4i32_6701:
1461; SSE41:       # %bb.0:
1462; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1463; SSE41-NEXT:    retq
1464;
1465; AVX-LABEL: shuffle_v4i32_6701:
1466; AVX:       # %bb.0:
1467; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1468; AVX-NEXT:    retq
1469  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1470  ret <4 x i32> %shuffle
1471}
1472
; Mask <5,6,7,0> over %a and %b, so the result is
; <%b[1], %b[2], %b[3], %a[0]>.
; The SSE2 and SSE3 runs expect two shufps and a register copy; the SSSE3,
; SSE4.1 and AVX runs expect a single (v)palignr.
1473define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
1474; SSE2-LABEL: shuffle_v4i32_5670:
1475; SSE2:       # %bb.0:
1476; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1477; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1478; SSE2-NEXT:    movaps %xmm1, %xmm0
1479; SSE2-NEXT:    retq
1480;
1481; SSE3-LABEL: shuffle_v4i32_5670:
1482; SSE3:       # %bb.0:
1483; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
1484; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
1485; SSE3-NEXT:    movaps %xmm1, %xmm0
1486; SSE3-NEXT:    retq
1487;
1488; SSSE3-LABEL: shuffle_v4i32_5670:
1489; SSSE3:       # %bb.0:
1490; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1491; SSSE3-NEXT:    retq
1492;
1493; SSE41-LABEL: shuffle_v4i32_5670:
1494; SSE41:       # %bb.0:
1495; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1496; SSE41-NEXT:    retq
1497;
1498; AVX-LABEL: shuffle_v4i32_5670:
1499; AVX:       # %bb.0:
1500; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
1501; AVX-NEXT:    retq
1502  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
1503  ret <4 x i32> %shuffle
1504}
1505
; Mask <1,2,3,4> over %a and %b, so the result is
; <%a[1], %a[2], %a[3], %b[0]>.
; The SSE2 and SSE3 runs expect two shufps; the SSSE3 and SSE4.1 runs expect
; palignr plus a register copy; the AVX runs expect a single vpalignr.
1506define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
1507; SSE2-LABEL: shuffle_v4i32_1234:
1508; SSE2:       # %bb.0:
1509; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1510; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1511; SSE2-NEXT:    retq
1512;
1513; SSE3-LABEL: shuffle_v4i32_1234:
1514; SSE3:       # %bb.0:
1515; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
1516; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
1517; SSE3-NEXT:    retq
1518;
1519; SSSE3-LABEL: shuffle_v4i32_1234:
1520; SSSE3:       # %bb.0:
1521; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1522; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1523; SSSE3-NEXT:    retq
1524;
1525; SSE41-LABEL: shuffle_v4i32_1234:
1526; SSE41:       # %bb.0:
1527; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1528; SSE41-NEXT:    movdqa %xmm1, %xmm0
1529; SSE41-NEXT:    retq
1530;
1531; AVX-LABEL: shuffle_v4i32_1234:
1532; AVX:       # %bb.0:
1533; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
1534; AVX-NEXT:    retq
1535  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
1536  ret <4 x i32> %shuffle
1537}
1538
; Mask <2,3,4,5> over %a and %b, so the result is
; <%a[2], %a[3], %b[0], %b[1]>.
; The SSE2 and SSE3 runs expect a single shufps; the SSSE3 and SSE4.1 runs
; expect palignr plus a register copy; the AVX runs expect a single vpalignr.
1539define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
1540; SSE2-LABEL: shuffle_v4i32_2345:
1541; SSE2:       # %bb.0:
1542; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1543; SSE2-NEXT:    retq
1544;
1545; SSE3-LABEL: shuffle_v4i32_2345:
1546; SSE3:       # %bb.0:
1547; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1548; SSE3-NEXT:    retq
1549;
1550; SSSE3-LABEL: shuffle_v4i32_2345:
1551; SSSE3:       # %bb.0:
1552; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1553; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1554; SSSE3-NEXT:    retq
1555;
1556; SSE41-LABEL: shuffle_v4i32_2345:
1557; SSE41:       # %bb.0:
1558; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1559; SSE41-NEXT:    movdqa %xmm1, %xmm0
1560; SSE41-NEXT:    retq
1561;
1562; AVX-LABEL: shuffle_v4i32_2345:
1563; AVX:       # %bb.0:
1564; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
1565; AVX-NEXT:    retq
1566  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
1567  ret <4 x i32> %shuffle
1568}
1569
1570; PR22391
; Two chained shuffles. %s1 is <%a[0], %a[1], %a[2], %a[2]>; %s2 takes lane 3
; of %s1 followed by lanes 0-2 of %b, giving <%a[2], %b[0], %b[1], %b[2]>.
; The SSE2 and SSE3 runs expect two shufps; the SSSE3, SSE4.1, AVX1 and AVX2
; runs expect a (v)pshufd splat of %a[2] followed by (v)palignr; the AVX512VL
; runs expect vpermi2d with the constant index vector [6,0,1,2].
1571define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) {
1572; SSE2-LABEL: shuffle_v4i32_2456:
1573; SSE2:       # %bb.0:
1574; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1575; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1576; SSE2-NEXT:    retq
1577;
1578; SSE3-LABEL: shuffle_v4i32_2456:
1579; SSE3:       # %bb.0:
1580; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
1581; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1582; SSE3-NEXT:    retq
1583;
1584; SSSE3-LABEL: shuffle_v4i32_2456:
1585; SSSE3:       # %bb.0:
1586; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1587; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1588; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1589; SSSE3-NEXT:    retq
1590;
1591; SSE41-LABEL: shuffle_v4i32_2456:
1592; SSE41:       # %bb.0:
1593; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1594; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1595; SSE41-NEXT:    movdqa %xmm1, %xmm0
1596; SSE41-NEXT:    retq
1597;
1598; AVX1OR2-LABEL: shuffle_v4i32_2456:
1599; AVX1OR2:       # %bb.0:
1600; AVX1OR2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1601; AVX1OR2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1602; AVX1OR2-NEXT:    retq
1603;
1604; AVX512VL-LABEL: shuffle_v4i32_2456:
1605; AVX512VL:       # %bb.0:
1606; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,0,1,2]
1607; AVX512VL-NEXT:    vpermi2d %xmm0, %xmm1, %xmm2
1608; AVX512VL-NEXT:    vmovdqa %xmm2, %xmm0
1609; AVX512VL-NEXT:    retq
1610  %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1611  %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1612  ret <4 x i32> %s2
1613}
1614
; Mask <4,0,undef,1> over %a and %b, so the result is
; <%b[0], %a[0], undef, %a[1]>.
; Every run expects a single (v)unpcklps of %b with %a, which produces
; <%b[0], %a[0], %b[1], %a[1]>; the SSE runs add a register copy.
1615define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
1616; SSE-LABEL: shuffle_v4i32_40u1:
1617; SSE:       # %bb.0:
1618; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1619; SSE-NEXT:    movaps %xmm1, %xmm0
1620; SSE-NEXT:    retq
1621;
1622; AVX-LABEL: shuffle_v4i32_40u1:
1623; AVX:       # %bb.0:
1624; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1625; AVX-NEXT:    retq
1626  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
1627  ret <4 x i32> %shuffle
1628}
1629
; Mask <3,4,5,6> over %a and %b, so the result is
; <%a[3], %b[0], %b[1], %b[2]>.
; The SSE2 and SSE3 runs expect two shufps; the SSSE3 and SSE4.1 runs expect
; palignr plus a register copy; the AVX runs expect a single vpalignr.
1630define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
1631; SSE2-LABEL: shuffle_v4i32_3456:
1632; SSE2:       # %bb.0:
1633; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1634; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1635; SSE2-NEXT:    retq
1636;
1637; SSE3-LABEL: shuffle_v4i32_3456:
1638; SSE3:       # %bb.0:
1639; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
1640; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
1641; SSE3-NEXT:    retq
1642;
1643; SSSE3-LABEL: shuffle_v4i32_3456:
1644; SSSE3:       # %bb.0:
1645; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1646; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1647; SSSE3-NEXT:    retq
1648;
1649; SSE41-LABEL: shuffle_v4i32_3456:
1650; SSE41:       # %bb.0:
1651; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1652; SSE41-NEXT:    movdqa %xmm1, %xmm0
1653; SSE41-NEXT:    retq
1654;
1655; AVX-LABEL: shuffle_v4i32_3456:
1656; AVX:       # %bb.0:
1657; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
1658; AVX-NEXT:    retq
1659  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
1660  ret <4 x i32> %shuffle
1661}
1662
; v4i32 shuffle with mask <0, undef, 1, undef>; only elements 0 and 1 of %a are
; referenced and %b is never selected.
; Targets before SSE4.1 are checked for a single pshufd; SSE4.1 and the AVX
; configurations are checked for pmovzxdq/vpmovzxdq.
1663define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
1664; SSE2-LABEL: shuffle_v4i32_0u1u:
1665; SSE2:       # %bb.0:
1666; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1667; SSE2-NEXT:    retq
1668;
1669; SSE3-LABEL: shuffle_v4i32_0u1u:
1670; SSE3:       # %bb.0:
1671; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1672; SSE3-NEXT:    retq
1673;
1674; SSSE3-LABEL: shuffle_v4i32_0u1u:
1675; SSSE3:       # %bb.0:
1676; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1677; SSSE3-NEXT:    retq
1678;
1679; SSE41-LABEL: shuffle_v4i32_0u1u:
1680; SSE41:       # %bb.0:
1681; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1682; SSE41-NEXT:    retq
1683;
1684; AVX-LABEL: shuffle_v4i32_0u1u:
1685; AVX:       # %bb.0:
1686; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1687; AVX-NEXT:    retq
1688  %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
1689  ret <4 x i32> %shuffle
1690}
1691
; Shuffle of %a with a zero vector using mask <0, 5, 1, 7>: elements 0 and 1 of
; %a interleaved with zeros.
; Targets before SSE4.1 are checked for xorps + unpcklps; SSE4.1 and the AVX
; configurations are checked for a single pmovzxdq/vpmovzxdq.
1692define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
1693; SSE2-LABEL: shuffle_v4i32_0z1z:
1694; SSE2:       # %bb.0:
1695; SSE2-NEXT:    xorps %xmm1, %xmm1
1696; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1697; SSE2-NEXT:    retq
1698;
1699; SSE3-LABEL: shuffle_v4i32_0z1z:
1700; SSE3:       # %bb.0:
1701; SSE3-NEXT:    xorps %xmm1, %xmm1
1702; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1703; SSE3-NEXT:    retq
1704;
1705; SSSE3-LABEL: shuffle_v4i32_0z1z:
1706; SSSE3:       # %bb.0:
1707; SSSE3-NEXT:    xorps %xmm1, %xmm1
1708; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1709; SSSE3-NEXT:    retq
1710;
1711; SSE41-LABEL: shuffle_v4i32_0z1z:
1712; SSE41:       # %bb.0:
1713; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1714; SSE41-NEXT:    retq
1715;
1716; AVX-LABEL: shuffle_v4i32_0z1z:
1717; AVX:       # %bb.0:
1718; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
1719; AVX-NEXT:    retq
1720  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1721  ret <4 x i32> %shuffle
1722}
1723
; Shuffle of %a with a zero vector using mask <0, 1, 7, undef>: the low two
; elements of %a are kept, lane 2 is zero and lane 3 is undef.
; All configurations are checked for a single movq/vmovq that zeroes the upper
; 64 bits.
1724define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
1725; SSE-LABEL: shuffle_v4i32_01zu:
1726; SSE:       # %bb.0:
1727; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
1728; SSE-NEXT:    retq
1729;
1730; AVX-LABEL: shuffle_v4i32_01zu:
1731; AVX:       # %bb.0:
1732; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1733; AVX-NEXT:    retq
1734  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef>
1735  ret <4 x i32> %shuffle
1736}
1737
; Shuffle of %a with a zero vector using mask <0, 4, 2, 3>: lane 1 is replaced
; by zero, the other lanes of %a stay in place.
; Targets before SSE4.1 are checked for an andps with a constant-pool operand;
; SSE4.1 and the AVX configurations are checked for xorps + blendps.
1738define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
1739; SSE2-LABEL: shuffle_v4i32_0z23:
1740; SSE2:       # %bb.0:
1741; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1742; SSE2-NEXT:    retq
1743;
1744; SSE3-LABEL: shuffle_v4i32_0z23:
1745; SSE3:       # %bb.0:
1746; SSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1747; SSE3-NEXT:    retq
1748;
1749; SSSE3-LABEL: shuffle_v4i32_0z23:
1750; SSSE3:       # %bb.0:
1751; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1752; SSSE3-NEXT:    retq
1753;
1754; SSE41-LABEL: shuffle_v4i32_0z23:
1755; SSE41:       # %bb.0:
1756; SSE41-NEXT:    xorps %xmm1, %xmm1
1757; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1758; SSE41-NEXT:    retq
1759;
1760; AVX-LABEL: shuffle_v4i32_0z23:
1761; AVX:       # %bb.0:
1762; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1763; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1764; AVX-NEXT:    retq
1765  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
1766  ret <4 x i32> %shuffle
1767}
1768
; Shuffle of %a with a zero vector using mask <0, 1, 4, 3>: lane 2 is replaced
; by zero, the other lanes of %a stay in place.
; Targets before SSE4.1 are checked for an andps with a constant-pool operand;
; SSE4.1 and the AVX configurations are checked for xorps + blendps.
1769define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
1770; SSE2-LABEL: shuffle_v4i32_01z3:
1771; SSE2:       # %bb.0:
1772; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1773; SSE2-NEXT:    retq
1774;
1775; SSE3-LABEL: shuffle_v4i32_01z3:
1776; SSE3:       # %bb.0:
1777; SSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1778; SSE3-NEXT:    retq
1779;
1780; SSSE3-LABEL: shuffle_v4i32_01z3:
1781; SSSE3:       # %bb.0:
1782; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1783; SSSE3-NEXT:    retq
1784;
1785; SSE41-LABEL: shuffle_v4i32_01z3:
1786; SSE41:       # %bb.0:
1787; SSE41-NEXT:    xorps %xmm1, %xmm1
1788; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1789; SSE41-NEXT:    retq
1790;
1791; AVX-LABEL: shuffle_v4i32_01z3:
1792; AVX:       # %bb.0:
1793; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1794; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
1795; AVX-NEXT:    retq
1796  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
1797  ret <4 x i32> %shuffle
1798}
1799
; Shuffle of %a with a zero vector using mask <0, 1, 2, 7>: lane 3 is replaced
; by zero, the other lanes of %a stay in place.
; Targets before SSE4.1 are checked for an andps with a constant-pool operand;
; SSE4.1 and the AVX configurations are checked for xorps + blendps.
1800define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
1801; SSE2-LABEL: shuffle_v4i32_012z:
1802; SSE2:       # %bb.0:
1803; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1804; SSE2-NEXT:    retq
1805;
1806; SSE3-LABEL: shuffle_v4i32_012z:
1807; SSE3:       # %bb.0:
1808; SSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1809; SSE3-NEXT:    retq
1810;
1811; SSSE3-LABEL: shuffle_v4i32_012z:
1812; SSSE3:       # %bb.0:
1813; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1814; SSSE3-NEXT:    retq
1815;
1816; SSE41-LABEL: shuffle_v4i32_012z:
1817; SSE41:       # %bb.0:
1818; SSE41-NEXT:    xorps %xmm1, %xmm1
1819; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1820; SSE41-NEXT:    retq
1821;
1822; AVX-LABEL: shuffle_v4i32_012z:
1823; AVX:       # %bb.0:
1824; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1825; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1826; AVX-NEXT:    retq
1827  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1828  ret <4 x i32> %shuffle
1829}
1830
; Shuffle of %a with a zero vector using mask <0, 4, 4, 3>: lanes 1 and 2 are
; replaced by zero, lanes 0 and 3 of %a stay in place.
; Targets before SSE4.1 are checked for an andps with a constant-pool operand;
; SSE4.1 and the AVX configurations are checked for xorps + blendps.
1831define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
1832; SSE2-LABEL: shuffle_v4i32_0zz3:
1833; SSE2:       # %bb.0:
1834; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1835; SSE2-NEXT:    retq
1836;
1837; SSE3-LABEL: shuffle_v4i32_0zz3:
1838; SSE3:       # %bb.0:
1839; SSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1840; SSE3-NEXT:    retq
1841;
1842; SSSE3-LABEL: shuffle_v4i32_0zz3:
1843; SSSE3:       # %bb.0:
1844; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1845; SSSE3-NEXT:    retq
1846;
1847; SSE41-LABEL: shuffle_v4i32_0zz3:
1848; SSE41:       # %bb.0:
1849; SSE41-NEXT:    xorps %xmm1, %xmm1
1850; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1851; SSE41-NEXT:    retq
1852;
1853; AVX-LABEL: shuffle_v4i32_0zz3:
1854; AVX:       # %bb.0:
1855; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1856; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
1857; AVX-NEXT:    retq
1858  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
1859  ret <4 x i32> %shuffle
1860}
1861
; A v4i32 shuffle <1, 5, 0, 4> followed, through a bitcast to <2 x double>, by
; a swap of the two 64-bit halves. The combined effect is the interleave
; <0, 4, 1, 5>, which every configuration is checked to emit as one
; unpcklps/vunpcklps.
1862define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
1863; SSE-LABEL: shuffle_v4i32_bitcast_0415:
1864; SSE:       # %bb.0:
1865; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1866; SSE-NEXT:    retq
1867;
1868; AVX-LABEL: shuffle_v4i32_bitcast_0415:
1869; AVX:       # %bb.0:
1870; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1871; AVX-NEXT:    retq
1872  %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
1873  %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
1874  %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
1875  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
1876  ret <4 x i32> %bitcast32
1877}
1878
; Duplicates the low elements of %b with mask <0, 0, 1, 1>, then, via
; <2 x double> bitcasts, takes the low 64 bits of that result followed by the
; low 64 bits of %a. In 32-bit lanes the result is b[0], b[0], a[0], a[1].
; The checks expect one shufps/vshufps (plus a register copy without VEX).
1879define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
1880; SSE-LABEL: shuffle_v4f32_bitcast_4401:
1881; SSE:       # %bb.0:
1882; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
1883; SSE-NEXT:    movaps %xmm1, %xmm0
1884; SSE-NEXT:    retq
1885;
1886; AVX-LABEL: shuffle_v4f32_bitcast_4401:
1887; AVX:       # %bb.0:
1888; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
1889; AVX-NEXT:    retq
1890  %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1891  %2 = bitcast <4 x i32> %1 to <2 x double>
1892  %3 = bitcast <4 x float> %a to <2 x double>
1893  %4 = shufflevector <2 x double> %2, <2 x double> %3, <2 x i32> <i32 0, i32 2>
1894  %5 = bitcast <2 x double> %4 to <4 x float>
1895  ret <4 x float> %5
1896}
1897
; Duplicates the low elements of %a with mask <0, 0, 1, 1>, then shuffles that
; with %b (bitcast from <4 x i32>) using mask <1, 0, 4, 5>. Lanes 0 and 1 of the
; intermediate both hold a[0], so the result is a[0], a[0], b[0], b[1].
; The checks expect a single shufps/vshufps.
1898define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
1899; SSE-LABEL: shuffle_v4f32_bitcast_0045:
1900; SSE:       # %bb.0:
1901; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1902; SSE-NEXT:    retq
1903;
1904; AVX-LABEL: shuffle_v4f32_bitcast_0045:
1905; AVX:       # %bb.0:
1906; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
1907; AVX-NEXT:    retq
1908  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
1909  %2 = bitcast <4 x i32> %b to <4 x float>
1910  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 1, i32 0, i32 4, i32 5>
1911  ret <4 x float> %3
1912}
1913
; Blend expressed as bitwise logic on <4 x i32>: each input is and-ed with a
; per-lane all-ones/zero mask (the two masks are complementary) and the results
; are or-ed. Lanes 1 and 2 come from %a, lanes 0 and 3 from %b.
; Targets before SSE4.1 are checked for two shufps plus a register copy; SSE4.1
; and the AVX configurations are checked for a single blendps/vblendps.
1914define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
1915; SSE2-LABEL: mask_v4f32_4127:
1916; SSE2:       # %bb.0:
1917; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1918; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1919; SSE2-NEXT:    movaps %xmm1, %xmm0
1920; SSE2-NEXT:    retq
1921;
1922; SSE3-LABEL: mask_v4f32_4127:
1923; SSE3:       # %bb.0:
1924; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1925; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1926; SSE3-NEXT:    movaps %xmm1, %xmm0
1927; SSE3-NEXT:    retq
1928;
1929; SSSE3-LABEL: mask_v4f32_4127:
1930; SSSE3:       # %bb.0:
1931; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
1932; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1933; SSSE3-NEXT:    movaps %xmm1, %xmm0
1934; SSSE3-NEXT:    retq
1935;
1936; SSE41-LABEL: mask_v4f32_4127:
1937; SSE41:       # %bb.0:
1938; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1939; SSE41-NEXT:    retq
1940;
1941; AVX-LABEL: mask_v4f32_4127:
1942; AVX:       # %bb.0:
1943; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
1944; AVX-NEXT:    retq
1945  %1 = bitcast <4 x float> %a to <4 x i32>
1946  %2 = bitcast <4 x float> %b to <4 x i32>
1947  %3 = and <4 x i32> %1, <i32  0, i32 -1, i32 -1, i32  0>
1948  %4 = and <4 x i32> %2, <i32 -1, i32  0, i32  0, i32 -1>
1949  %5 = or <4 x i32> %4, %3
1950  %6 = bitcast <4 x i32> %5 to <4 x float>
1951  ret <4 x float> %6
1952}
1953
; Blend expressed as and/or on <2 x i64> with complementary constants: only the
; upper 32 bits of the second i64 of %a are kept (mask -4294967296), and all
; remaining bits are taken from %b.
; Targets before SSE4.1 are checked for two shufps plus a register copy; SSE4.1
; and the AVX configurations are checked for a single blendps/vblendps.
1954define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
1955; SSE2-LABEL: mask_v4f32_0127:
1956; SSE2:       # %bb.0:
1957; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1958; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1959; SSE2-NEXT:    movaps %xmm1, %xmm0
1960; SSE2-NEXT:    retq
1961;
1962; SSE3-LABEL: mask_v4f32_0127:
1963; SSE3:       # %bb.0:
1964; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1965; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1966; SSE3-NEXT:    movaps %xmm1, %xmm0
1967; SSE3-NEXT:    retq
1968;
1969; SSSE3-LABEL: mask_v4f32_0127:
1970; SSSE3:       # %bb.0:
1971; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1972; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1973; SSSE3-NEXT:    movaps %xmm1, %xmm0
1974; SSSE3-NEXT:    retq
1975;
1976; SSE41-LABEL: mask_v4f32_0127:
1977; SSE41:       # %bb.0:
1978; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1979; SSE41-NEXT:    retq
1980;
1981; AVX-LABEL: mask_v4f32_0127:
1982; AVX:       # %bb.0:
1983; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
1984; AVX-NEXT:    retq
1985  %1 = bitcast <4 x float> %a to <2 x i64>
1986  %2 = bitcast <4 x float> %b to <2 x i64>
1987  %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
1988  %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
1989  %5 = or <2 x i64> %4, %3
1990  %6 = bitcast <2 x i64> %5 to <4 x float>
1991  ret <4 x float> %6
1992}
1993
; Integer-typed variant of the <2 x i64> and/or blend: only the upper 32 bits of
; the second i64 of %a are kept (mask -4294967296), and all remaining bits are
; taken from %b.
; Targets before SSE4.1 are checked for two shufps plus a register copy; SSE4.1
; and the AVX configurations are checked for a single blendps/vblendps.
1994define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
1995; SSE2-LABEL: mask_v4i32_0127:
1996; SSE2:       # %bb.0:
1997; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
1998; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
1999; SSE2-NEXT:    movaps %xmm1, %xmm0
2000; SSE2-NEXT:    retq
2001;
2002; SSE3-LABEL: mask_v4i32_0127:
2003; SSE3:       # %bb.0:
2004; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
2005; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2006; SSE3-NEXT:    movaps %xmm1, %xmm0
2007; SSE3-NEXT:    retq
2008;
2009; SSSE3-LABEL: mask_v4i32_0127:
2010; SSSE3:       # %bb.0:
2011; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
2012; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
2013; SSSE3-NEXT:    movaps %xmm1, %xmm0
2014; SSSE3-NEXT:    retq
2015;
2016; SSE41-LABEL: mask_v4i32_0127:
2017; SSE41:       # %bb.0:
2018; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2019; SSE41-NEXT:    retq
2020;
2021; AVX-LABEL: mask_v4i32_0127:
2022; AVX:       # %bb.0:
2023; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
2024; AVX-NEXT:    retq
2025  %1 = bitcast <4 x i32> %a to <2 x i64>
2026  %2 = bitcast <4 x i32> %b to <2 x i64>
2027  %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
2028  %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
2029  %5 = or <2 x i64> %4, %3
2030  %6 = bitcast <2 x i64> %5 to <4 x i32>
2031  ret <4 x i32> %6
2032}
2033
; Loads a <2 x float> with align 1 and repeats it twice using the widening mask
; <0, 1, 0, 1>.
; The SSE2-only run is checked for movsd + movlhps; SSE3 and every later
; configuration is checked for a single movddup/vmovddup from memory.
2034define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
2035; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
2036; SSE2:       # %bb.0:
2037; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2038; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
2039; SSE2-NEXT:    retq
2040;
2041; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
2042; SSE3:       # %bb.0:
2043; SSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
2044; SSE3-NEXT:    retq
2045;
2046; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
2047; SSSE3:       # %bb.0:
2048; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
2049; SSSE3-NEXT:    retq
2050;
2051; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
2052; SSE41:       # %bb.0:
2053; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
2054; SSE41-NEXT:    retq
2055;
2056; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
2057; AVX:       # %bb.0:
2058; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
2059; AVX-NEXT:    retq
2060  %1 = load <2 x float>, <2 x float>* %x, align 1
2061  %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
2062  ret <4 x float> %2
2063}
2064
; Extracts element 3 of %a1 and inserts it into element 0 of %a0; the other
; lanes of %a0 are unchanged.
; Targets before SSE4.1 are checked for pshufd + movd to eax + movd back +
; movss; SSE4.1 and the AVX configurations are checked for extractps + pinsrd.
2065define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) {
2066; SSE2-LABEL: extract3_insert0_v4i32_7123:
2067; SSE2:       # %bb.0:
2068; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2069; SSE2-NEXT:    movd %xmm1, %eax
2070; SSE2-NEXT:    movd %eax, %xmm1
2071; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2072; SSE2-NEXT:    retq
2073;
2074; SSE3-LABEL: extract3_insert0_v4i32_7123:
2075; SSE3:       # %bb.0:
2076; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2077; SSE3-NEXT:    movd %xmm1, %eax
2078; SSE3-NEXT:    movd %eax, %xmm1
2079; SSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2080; SSE3-NEXT:    retq
2081;
2082; SSSE3-LABEL: extract3_insert0_v4i32_7123:
2083; SSSE3:       # %bb.0:
2084; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
2085; SSSE3-NEXT:    movd %xmm1, %eax
2086; SSSE3-NEXT:    movd %eax, %xmm1
2087; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2088; SSSE3-NEXT:    retq
2089;
2090; SSE41-LABEL: extract3_insert0_v4i32_7123:
2091; SSE41:       # %bb.0:
2092; SSE41-NEXT:    extractps $3, %xmm1, %eax
2093; SSE41-NEXT:    pinsrd $0, %eax, %xmm0
2094; SSE41-NEXT:    retq
2095;
2096; AVX-LABEL: extract3_insert0_v4i32_7123:
2097; AVX:       # %bb.0:
2098; AVX-NEXT:    vextractps $3, %xmm1, %eax
2099; AVX-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
2100; AVX-NEXT:    retq
2101  %1 = extractelement <4 x i32> %a1, i32 3
2102  %2 = insertelement <4 x i32> %a0, i32 %1, i32 0
2103  ret <4 x i32> %2
2104}
2105
; Extracts element 3 of %a1 and inserts it into element 3 of %a0, so source and
; destination lane match and the operation is a lane-wise blend.
; Targets before SSE4.1 are checked for two shufps; SSE4.1 and the AVX
; configurations are checked for a single blendps/vblendps.
2106define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) {
2107; SSE2-LABEL: extract3_insert3_v4i32_0127:
2108; SSE2:       # %bb.0:
2109; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2110; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2111; SSE2-NEXT:    retq
2112;
2113; SSE3-LABEL: extract3_insert3_v4i32_0127:
2114; SSE3:       # %bb.0:
2115; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2116; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2117; SSE3-NEXT:    retq
2118;
2119; SSSE3-LABEL: extract3_insert3_v4i32_0127:
2120; SSSE3:       # %bb.0:
2121; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
2122; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
2123; SSSE3-NEXT:    retq
2124;
2125; SSE41-LABEL: extract3_insert3_v4i32_0127:
2126; SSE41:       # %bb.0:
2127; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2128; SSE41-NEXT:    retq
2129;
2130; AVX-LABEL: extract3_insert3_v4i32_0127:
2131; AVX:       # %bb.0:
2132; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2133; AVX-NEXT:    retq
2134  %1 = extractelement <4 x i32> %a1, i32 3
2135  %2 = insertelement <4 x i32> %a0, i32 %1, i32 3
2136  ret <4 x i32> %2
2137}
2138
; Inserts the scalar argument %a into lane 0 and takes lanes 1-3 from a zero
; vector (mask <0, 5, 6, 7>).
; Every configuration is checked for a single movd/vmovd from edi.
2139define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
2140; SSE-LABEL: insert_reg_and_zero_v4i32:
2141; SSE:       # %bb.0:
2142; SSE-NEXT:    movd %edi, %xmm0
2143; SSE-NEXT:    retq
2144;
2145; AVX-LABEL: insert_reg_and_zero_v4i32:
2146; AVX:       # %bb.0:
2147; AVX-NEXT:    vmovd %edi, %xmm0
2148; AVX-NEXT:    retq
2149  %v = insertelement <4 x i32> undef, i32 %a, i32 0
2150  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2151  ret <4 x i32> %shuffle
2152}
2153
; Loads an i32 from %ptr, inserts it into lane 0 and takes lanes 1-3 from a
; zero vector (mask <0, 5, 6, 7>).
; Every configuration is checked for a single movss/vmovss load, which is
; printed as zeroing the remaining lanes.
2154define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
2155; SSE-LABEL: insert_mem_and_zero_v4i32:
2156; SSE:       # %bb.0:
2157; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2158; SSE-NEXT:    retq
2159;
2160; AVX-LABEL: insert_mem_and_zero_v4i32:
2161; AVX:       # %bb.0:
2162; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2163; AVX-NEXT:    retq
2164  %a = load i32, i32* %ptr
2165  %v = insertelement <4 x i32> undef, i32 %a, i32 0
2166  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2167  ret <4 x i32> %shuffle
2168}
2169
; Inserts the float argument %a into lane 0 and takes lanes 1-3 from a zero
; vector (mask <0, 5, 6, 7>).
; Targets before SSE4.1 are checked for xorps + movss + a register copy; SSE4.1
; and the AVX configurations are checked for xorps + blendps.
2170define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
2171; SSE2-LABEL: insert_reg_and_zero_v4f32:
2172; SSE2:       # %bb.0:
2173; SSE2-NEXT:    xorps %xmm1, %xmm1
2174; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2175; SSE2-NEXT:    movaps %xmm1, %xmm0
2176; SSE2-NEXT:    retq
2177;
2178; SSE3-LABEL: insert_reg_and_zero_v4f32:
2179; SSE3:       # %bb.0:
2180; SSE3-NEXT:    xorps %xmm1, %xmm1
2181; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2182; SSE3-NEXT:    movaps %xmm1, %xmm0
2183; SSE3-NEXT:    retq
2184;
2185; SSSE3-LABEL: insert_reg_and_zero_v4f32:
2186; SSSE3:       # %bb.0:
2187; SSSE3-NEXT:    xorps %xmm1, %xmm1
2188; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
2189; SSSE3-NEXT:    movaps %xmm1, %xmm0
2190; SSSE3-NEXT:    retq
2191;
2192; SSE41-LABEL: insert_reg_and_zero_v4f32:
2193; SSE41:       # %bb.0:
2194; SSE41-NEXT:    xorps %xmm1, %xmm1
2195; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2196; SSE41-NEXT:    retq
2197;
2198; AVX-LABEL: insert_reg_and_zero_v4f32:
2199; AVX:       # %bb.0:
2200; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2201; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
2202; AVX-NEXT:    retq
2203  %v = insertelement <4 x float> undef, float %a, i32 0
2204  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2205  ret <4 x float> %shuffle
2206}
2207
; Loads a float from %ptr, inserts it into lane 0 and takes lanes 1-3 from a
; zero vector (mask <0, 5, 6, 7>).
; Every configuration is checked for a single movss/vmovss load, which is
; printed as zeroing the remaining lanes.
2208define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
2209; SSE-LABEL: insert_mem_and_zero_v4f32:
2210; SSE:       # %bb.0:
2211; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2212; SSE-NEXT:    retq
2213;
2214; AVX-LABEL: insert_mem_and_zero_v4f32:
2215; AVX:       # %bb.0:
2216; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2217; AVX-NEXT:    retq
2218  %a = load float, float* %ptr
2219  %v = insertelement <4 x float> undef, float %a, i32 0
2220  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
2221  ret <4 x float> %shuffle
2222}
2223
; Bitcasts the i64 argument %a to <2 x i32>, widens it to four lanes and places
; it in the low half of the result; the high half comes from %b
; (mask <0, 1, 6, 7>).
; All runs are checked for a movq from rdi followed by one merge instruction:
; movsd before SSE4.1, pblendw on SSE4.1 and AVX1, vpblendd on AVX2/AVX512VL.
2224define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
2225; SSE2-LABEL: insert_reg_lo_v4i32:
2226; SSE2:       # %bb.0:
2227; SSE2-NEXT:    movq %rdi, %xmm1
2228; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2229; SSE2-NEXT:    retq
2230;
2231; SSE3-LABEL: insert_reg_lo_v4i32:
2232; SSE3:       # %bb.0:
2233; SSE3-NEXT:    movq %rdi, %xmm1
2234; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2235; SSE3-NEXT:    retq
2236;
2237; SSSE3-LABEL: insert_reg_lo_v4i32:
2238; SSSE3:       # %bb.0:
2239; SSSE3-NEXT:    movq %rdi, %xmm1
2240; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2241; SSSE3-NEXT:    retq
2242;
2243; SSE41-LABEL: insert_reg_lo_v4i32:
2244; SSE41:       # %bb.0:
2245; SSE41-NEXT:    movq %rdi, %xmm1
2246; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2247; SSE41-NEXT:    retq
2248;
2249; AVX1-LABEL: insert_reg_lo_v4i32:
2250; AVX1:       # %bb.0:
2251; AVX1-NEXT:    vmovq %rdi, %xmm1
2252; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2253; AVX1-NEXT:    retq
2254;
2255; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
2256; AVX2OR512VL:       # %bb.0:
2257; AVX2OR512VL-NEXT:    vmovq %rdi, %xmm1
2258; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2259; AVX2OR512VL-NEXT:    retq
2260  %a.cast = bitcast i64 %a to <2 x i32>
2261  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2262  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2263  ret <4 x i32> %shuffle
2264}
2265
; Loads a <2 x i32> from %ptr, widens it to four lanes and places it in the low
; half of the result; the high half comes from %b (mask <0, 1, 6, 7>).
; Targets before SSE4.1 are checked for a single movlps from memory; SSE4.1 and
; the AVX configurations are checked for a movsd load followed by blendps.
2266define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
2267; SSE2-LABEL: insert_mem_lo_v4i32:
2268; SSE2:       # %bb.0:
2269; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2270; SSE2-NEXT:    retq
2271;
2272; SSE3-LABEL: insert_mem_lo_v4i32:
2273; SSE3:       # %bb.0:
2274; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2275; SSE3-NEXT:    retq
2276;
2277; SSSE3-LABEL: insert_mem_lo_v4i32:
2278; SSSE3:       # %bb.0:
2279; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2280; SSSE3-NEXT:    retq
2281;
2282; SSE41-LABEL: insert_mem_lo_v4i32:
2283; SSE41:       # %bb.0:
2284; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
2285; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2286; SSE41-NEXT:    retq
2287;
2288; AVX-LABEL: insert_mem_lo_v4i32:
2289; AVX:       # %bb.0:
2290; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
2291; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2292; AVX-NEXT:    retq
2293  %a = load <2 x i32>, <2 x i32>* %ptr
2294  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2295  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2296  ret <4 x i32> %shuffle
2297}
2298
; Bitcasts the i64 argument %a to <2 x i32>, widens it to four lanes and places
; it in the high half of the result; the low half is the low half of %b
; (mask <4, 5, 0, 1>).
; Every configuration is checked for a movq from rdi followed by punpcklqdq.
2299define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
2300; SSE-LABEL: insert_reg_hi_v4i32:
2301; SSE:       # %bb.0:
2302; SSE-NEXT:    movq %rdi, %xmm1
2303; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2304; SSE-NEXT:    retq
2305;
2306; AVX-LABEL: insert_reg_hi_v4i32:
2307; AVX:       # %bb.0:
2308; AVX-NEXT:    vmovq %rdi, %xmm1
2309; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2310; AVX-NEXT:    retq
2311  %a.cast = bitcast i64 %a to <2 x i32>
2312  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2313  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2314  ret <4 x i32> %shuffle
2315}
2316
; Loads a <2 x i32> from %ptr, widens it to four lanes and places it in the high
; half of the result; the low half is the low half of %b (mask <4, 5, 0, 1>).
; Every configuration is checked for a movsd load followed by movlhps.
2317define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
2318; SSE-LABEL: insert_mem_hi_v4i32:
2319; SSE:       # %bb.0:
2320; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
2321; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2322; SSE-NEXT:    retq
2323;
2324; AVX-LABEL: insert_mem_hi_v4i32:
2325; AVX:       # %bb.0:
2326; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
2327; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2328; AVX-NEXT:    retq
2329  %a = load <2 x i32>, <2 x i32>* %ptr
2330  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2331  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2332  ret <4 x i32> %shuffle
2333}
2334
; Bitcasts the double argument %a to <2 x float>, widens it to four lanes and
; places it in the low half of the result; the high half comes from %b
; (mask <0, 1, 6, 7>).
; Targets before SSE4.1 are checked for a single shufps; SSE4.1 and the AVX
; configurations are checked for a single blendps/vblendps.
2335define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
2336; SSE2-LABEL: insert_reg_lo_v4f32:
2337; SSE2:       # %bb.0:
2338; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2339; SSE2-NEXT:    retq
2340;
2341; SSE3-LABEL: insert_reg_lo_v4f32:
2342; SSE3:       # %bb.0:
2343; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2344; SSE3-NEXT:    retq
2345;
2346; SSSE3-LABEL: insert_reg_lo_v4f32:
2347; SSSE3:       # %bb.0:
2348; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2349; SSSE3-NEXT:    retq
2350;
2351; SSE41-LABEL: insert_reg_lo_v4f32:
2352; SSE41:       # %bb.0:
2353; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2354; SSE41-NEXT:    retq
2355;
2356; AVX-LABEL: insert_reg_lo_v4f32:
2357; AVX:       # %bb.0:
2358; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2359; AVX-NEXT:    retq
2360  %a.cast = bitcast double %a to <2 x float>
2361  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2362  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2363  ret <4 x float> %shuffle
2364}
2365
; Loads a <2 x float> from %ptr, widens it to four lanes and places it in the
; low half of the result; the high half comes from %b (mask <0, 1, 6, 7>).
; Every configuration is checked for a single movlps/vmovlps from memory.
2366define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
2367; SSE-LABEL: insert_mem_lo_v4f32:
2368; SSE:       # %bb.0:
2369; SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2370; SSE-NEXT:    retq
2371;
2372; AVX-LABEL: insert_mem_lo_v4f32:
2373; AVX:       # %bb.0:
2374; AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2375; AVX-NEXT:    retq
2376  %a = load <2 x float>, <2 x float>* %ptr
2377  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2378  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2379  ret <4 x float> %shuffle
2380}
2381
; Bitcasts the double argument %a to <2 x float>, widens it to four lanes and
; places it in the high half of the result; the low half is the low half of %b
; (mask <4, 5, 0, 1>).
; Every configuration is checked for a single movlhps/vmovlhps (plus a register
; copy in the non-VEX form).
2382define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
2383; SSE-LABEL: insert_reg_hi_v4f32:
2384; SSE:       # %bb.0:
2385; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2386; SSE-NEXT:    movaps %xmm1, %xmm0
2387; SSE-NEXT:    retq
2388;
2389; AVX-LABEL: insert_reg_hi_v4f32:
2390; AVX:       # %bb.0:
2391; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2392; AVX-NEXT:    retq
2393  %a.cast = bitcast double %a to <2 x float>
2394  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2395  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2396  ret <4 x float> %shuffle
2397}
2398
; Loads a <2 x float> from %ptr, widens it to four lanes and places it in the
; high half of the result; the low half is the low half of %b
; (mask <4, 5, 0, 1>).
; Every configuration is checked for a single movhps/vmovhps from memory.
2399define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
2400; SSE-LABEL: insert_mem_hi_v4f32:
2401; SSE:       # %bb.0:
2402; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2403; SSE-NEXT:    retq
2404;
2405; AVX-LABEL: insert_mem_hi_v4f32:
2406; AVX:       # %bb.0:
2407; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2408; AVX-NEXT:    retq
2409  %a = load <2 x float>, <2 x float>* %ptr
2410  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
2411  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
2412  ret <4 x float> %shuffle
2413}
2414
2415; PR21137
; Loads a <4 x float> from %ptr and reverses its lanes (mask <3, 2, 1, 0>).
; The non-VEX runs are checked for a separate movaps load followed by shufps;
; the AVX runs are checked for a vpermilps that takes the memory operand
; directly.
2416define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
2417; SSE-LABEL: shuffle_mem_v4f32_3210:
2418; SSE:       # %bb.0:
2419; SSE-NEXT:    movaps (%rdi), %xmm0
2420; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
2421; SSE-NEXT:    retq
2422;
2423; AVX-LABEL: shuffle_mem_v4f32_3210:
2424; AVX:       # %bb.0:
2425; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
2426; AVX-NEXT:    retq
2427  %a = load <4 x float>, <4 x float>* %ptr
2428  %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2429  ret <4 x float> %shuffle
2430}
2431
; Loads an i32 from %ptr (align 4), inserts it into lane 0 of a zero vector and
; splats lane 0 to all four lanes (all-zero shuffle mask).
; The non-VEX runs are checked for a movd load followed by pshufd; the AVX runs
; are checked for a single vbroadcastss from memory.
2432define <4 x i32> @insert_dup_mem_v4i32(i32* %ptr) {
2433; SSE-LABEL: insert_dup_mem_v4i32:
2434; SSE:       # %bb.0:
2435; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2436; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2437; SSE-NEXT:    retq
2438;
2439; AVX-LABEL: insert_dup_mem_v4i32:
2440; AVX:       # %bb.0:
2441; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
2442; AVX-NEXT:    retq
2443  %tmp = load i32, i32* %ptr, align 4
2444  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2445  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
2446  ret <4 x i32> %tmp2
2447}
2448
2449; PR41249
; A single <2 x float> load from %p0 feeds two different shuffles:
;   - %3 interleaves the two loaded elements with zeros and is stored to %p1;
;   - %4 splats loaded element 0 and is returned.
; Every configuration is checked for one shared movsd/vmovsd load, then xorps +
; unpcklps for the stored value and a separate splat for the return value
; (shufps, vpermilps on AVX1, vbroadcastss on AVX2/AVX512VL).
2450define <4 x float> @shuffle_mem_pmovzx_v4f32(<2 x float>* %p0, <4 x float>* %p1) {
2451; SSE-LABEL: shuffle_mem_pmovzx_v4f32:
2452; SSE:       # %bb.0:
2453; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
2454; SSE-NEXT:    xorps %xmm1, %xmm1
2455; SSE-NEXT:    movaps %xmm0, %xmm2
2456; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2457; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2458; SSE-NEXT:    movaps %xmm2, (%rsi)
2459; SSE-NEXT:    retq
2460;
2461; AVX1-LABEL: shuffle_mem_pmovzx_v4f32:
2462; AVX1:       # %bb.0:
2463; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2464; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2465; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2466; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2467; AVX1-NEXT:    vmovaps %xmm1, (%rsi)
2468; AVX1-NEXT:    retq
2469;
2470; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32:
2471; AVX2OR512VL:       # %bb.0:
2472; AVX2OR512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2473; AVX2OR512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2474; AVX2OR512VL-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2475; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %xmm0
2476; AVX2OR512VL-NEXT:    vmovaps %xmm1, (%rsi)
2477; AVX2OR512VL-NEXT:    retq
2478  %1 = load <2 x float>, <2 x float>* %p0
2479  %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
2480  %3 = shufflevector <4 x float> %2, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
2481  %4 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> zeroinitializer
2482  store <4 x float> %3, <4 x float>* %p1
2483  ret <4 x float> %4
2484}
2485
2486;
2487; Shuffle to logical bit shifts
2488;
2489
; Shuffle of %a with a zero vector using mask <4, 0, 4, undef>, i.e. the result
; lanes are [zero, a[0], zero, don't-care]. Both the SSE and AVX runs expect
; this to be emitted as a single 64-bit-element left shift by 32 bits
; (psllq / vpsllq) with no separate zero vector materialised.
2490define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
2491; SSE-LABEL: shuffle_v4i32_z0zX:
2492; SSE:       # %bb.0:
2493; SSE-NEXT:    psllq $32, %xmm0
2494; SSE-NEXT:    retq
2495;
2496; AVX-LABEL: shuffle_v4i32_z0zX:
2497; AVX:       # %bb.0:
2498; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
2499; AVX-NEXT:    retq
; Index 4 is lane 0 of the zeroinitializer operand; the last lane is undef.
2500  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
2501  ret <4 x i32> %shuffle
2502}
2503
; Shuffle of %a with a zero vector using mask <1, 4, 3, 4>, i.e. the result
; lanes are [a[1], zero, a[3], zero]. Both the SSE and AVX runs expect this to
; be emitted as a single 64-bit-element logical right shift by 32 bits
; (psrlq / vpsrlq).
2504define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
2505; SSE-LABEL: shuffle_v4i32_1z3z:
2506; SSE:       # %bb.0:
2507; SSE-NEXT:    psrlq $32, %xmm0
2508; SSE-NEXT:    retq
2509;
2510; AVX-LABEL: shuffle_v4i32_1z3z:
2511; AVX:       # %bb.0:
2512; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
2513; AVX-NEXT:    retq
; Index 4 is lane 0 of the zeroinitializer operand.
2514  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
2515  ret <4 x i32> %shuffle
2516}
2517
; Combines the low two lanes of the register argument %a with the low two lanes
; of a vector loaded from %pb (mask <0, 1, 4, 5>). The load is marked align 1.
; The checks expect the load to be folded into the shuffle as a memory operand:
; movhps on the SSE runs and vunpcklpd on the AVX runs, with no separate load
; instruction.
2518define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
2519; SSE-LABEL: shuffle_mem_v4f32_0145:
2520; SSE:       # %bb.0:
2521; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
2522; SSE-NEXT:    retq
2523;
2524; AVX-LABEL: shuffle_mem_v4f32_0145:
2525; AVX:       # %bb.0:
2526; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
2527; AVX-NEXT:    retq
2528  %b = load <4 x float>, <4 x float>* %pb, align 1
; Indices 4 and 5 are lanes 0 and 1 of the loaded vector %b.
2529  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
2530  ret <4 x float> %shuffle
2531}
2532
; Replaces the low two lanes of the register argument %a with the low two lanes
; of a vector loaded from %pb (mask <4, 5, 2, 3>). The load is marked align 1.
; Expected lowering differs by feature level:
;   - SSE2, SSE3 and SSSE3 runs expect a single movlps with a memory operand;
;   - the SSE4.1 run expects a separate movups load followed by a register blendps;
;   - the AVX runs expect one vblendps with the load folded as a memory operand.
2533define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) {
2534; SSE2-LABEL: shuffle_mem_v4f32_4523:
2535; SSE2:       # %bb.0:
2536; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2537; SSE2-NEXT:    retq
2538;
2539; SSE3-LABEL: shuffle_mem_v4f32_4523:
2540; SSE3:       # %bb.0:
2541; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2542; SSE3-NEXT:    retq
2543;
2544; SSSE3-LABEL: shuffle_mem_v4f32_4523:
2545; SSSE3:       # %bb.0:
2546; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2547; SSSE3-NEXT:    retq
2548;
2549; SSE41-LABEL: shuffle_mem_v4f32_4523:
2550; SSE41:       # %bb.0:
2551; SSE41-NEXT:    movups (%rdi), %xmm1
2552; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2553; SSE41-NEXT:    retq
2554;
2555; AVX-LABEL: shuffle_mem_v4f32_4523:
2556; AVX:       # %bb.0:
2557; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
2558; AVX-NEXT:    retq
2559  %b = load <4 x float>, <4 x float>* %pb, align 1
; Indices 4 and 5 are lanes 0 and 1 of the loaded vector %b.
2560  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
2561  ret <4 x float> %shuffle
2562}
2563
; Two-input shuffle where the FIRST shuffle operand is the vector loaded from
; %a1 and the second is the register argument %a0. Mask <0, 6, 2, 4> yields
; [mem[0], a0[2], mem[2], a0[0]]. The load carries no explicit alignment.
; Expected lowering:
;   - SSE runs and AVX1/AVX2 runs expect a two-instruction sequence, a shufps
;     with the load folded as its memory operand followed by a single-input
;     permute (shufps / vpermilps);
;   - the AVX512VL runs expect an explicit vmovaps load, a constant index
;     vector [0,6,2,4], a vpermi2ps, and a final register copy into xmm0.
2564define  <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) {
2565; SSE-LABEL: shuffle_mem_v4f32_0624:
2566; SSE:       # %bb.0:
2567; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2568; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2569; SSE-NEXT:    retq
2570;
2571; AVX1OR2-LABEL: shuffle_mem_v4f32_0624:
2572; AVX1OR2:       # %bb.0:
2573; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
2574; AVX1OR2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
2575; AVX1OR2-NEXT:    retq
2576;
2577; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
2578; AVX512VL:       # %bb.0:
2579; AVX512VL-NEXT:    vmovaps (%rdi), %xmm2
2580; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [0,6,2,4]
2581; AVX512VL-NEXT:    vpermi2ps %xmm0, %xmm2, %xmm1
2582; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
2583; AVX512VL-NEXT:    retq
2584  %1 = load <4 x float>, <4 x float>* %a1
; Indices 0-3 refer to the loaded vector %1, indices 4-7 to the argument %a0.
2585  %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
2586  ret <4 x float> %2
2587}
2588
; Two-input shuffle where the FIRST shuffle operand is the vector loaded from
; %a1 and the second is the register argument %a0. Mask <4, 7, 6, 0> yields
; [a0[0], a0[3], a0[2], mem[0]]. The load carries no explicit alignment.
; Expected lowering:
;   - SSE runs and AVX1/AVX2 runs expect two shufps-style instructions, the
;     first with the load folded as its memory operand (the SSE form needs an
;     extra movaps register copy first);
;   - the AVX512VL runs expect a constant index vector and one vpermt2ps with
;     the load folded as its memory operand. The checked index vector is
;     [0,3,2,4] rather than the IR mask.
;     NOTE(review): presumably the indices are renumbered because xmm0 (%a0)
;     is the first table operand in the emitted instruction - confirm.
2589define  <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, <4 x float>* %a1) {
2590; SSE-LABEL: shuffle_mem_v4f32_4760:
2591; SSE:       # %bb.0:
2592; SSE-NEXT:    movaps %xmm0, %xmm1
2593; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
2594; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2595; SSE-NEXT:    retq
2596;
2597; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
2598; AVX1OR2:       # %bb.0:
2599; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
2600; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
2601; AVX1OR2-NEXT:    retq
2602;
2603; AVX512VL-LABEL: shuffle_mem_v4f32_4760:
2604; AVX512VL:       # %bb.0:
2605; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [0,3,2,4]
2606; AVX512VL-NEXT:    vpermt2ps (%rdi), %xmm1, %xmm0
2607; AVX512VL-NEXT:    retq
2608  %1 = load <4 x float>, <4 x float>* %a1
; Indices 0-3 refer to the loaded vector %1, indices 4-7 to the argument %a0.
2609  %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 4, i32 7, i32 6, i32 0>
2610  ret <4 x float> %2
2611}
2612