1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
4; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
6; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
7; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
8; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
9;
10; Verify that the DAG combiner correctly folds bitwise operations across
11; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
12; basic and always-safe patterns. Also test that the DAG combiner will combine
13; target-specific shuffle instructions where reasonable.
14
15target triple = "x86_64-unknown-unknown"
16
17declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
18declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
19declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
20
; Mask 27 (0b00011011) reverses all four dwords; applying it twice is the
; identity permutation, so both shuffles must fold away (only retq is checked).
define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
  ret <4 x i32> %c
}
30
; The inner pshuflw mask -28 (0xE4 = [0,1,2,3]) is an identity shuffle of the
; low words, so the two reversing pshufds (mask 27) cancel through the bitcasts
; and the whole chain folds to a no-op.
define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}
43
; Same as combine_pshufd2 but with an identity pshufhw (mask -28 = 0xE4) in the
; middle: the two reversing pshufds cancel and everything folds to a no-op.
define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; CHECK-LABEL: combine_pshufd3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
  ret <4 x i32> %d
}
56
; Mask -31 (0xE1 = [1,0,2,3]) swaps dwords 0 and 1 — an involution confined to
; the low 64 bits — so the two outer pshufds cancel around the high-word-only
; pshufhw, leaving a single pshufhw that reverses the high four words.
define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd4:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd4:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
  ret <4 x i32> %d
}
75
; Mask -76 (0xB4 = [0,1,3,2]) swaps dwords 2 and 3 — an involution confined to
; the high 64 bits — so the two outer pshufds cancel around the low-word-only
; pshuflw, leaving a single pshuflw that reverses the low four words.
define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd5:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufd5:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
  %b.cast = bitcast <4 x i32> %b to <8 x i16>
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
  %c.cast = bitcast <8 x i16> %c to <4 x i32>
  %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
  ret <4 x i32> %d
}
94
; pshufd 0 broadcasts dword 0; the second shuffle (mask 8 = [0,2,0,0]) then
; still only reads copies of dword 0, so the pair folds to a single splat —
; one pshufd on SSE and a vbroadcastss on AVX2.
define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd6:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_pshufd6:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_pshufd6:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
entry:
  %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0)
  %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8)
  ret <4 x i32> %c
}
115
; Mask 27 reverses the low four words; applying pshuflw twice with it is the
; identity, so both shuffles fold away.
define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; CHECK-LABEL: combine_pshuflw1:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  ret <8 x i16> %c
}
125
; The pshufhw with mask -28 (0xE4) is an identity shuffle, so the two reversing
; pshuflws (mask 27) cancel and the chain folds to a no-op.
define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; CHECK-LABEL: combine_pshuflw2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}
136
; pshuflw and pshufhw operate on disjoint halves, so the two pshuflws (mask 27,
; an involution) cancel around the pshufhw and only one pshufhw remains.
define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; SSE-LABEL: combine_pshuflw3:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshuflw3:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}
153
; Mirror of combine_pshuflw3: the two pshufhws (mask 27, an involution on the
; high half) cancel around the pshuflw, leaving a single pshuflw.
define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; SSE-LABEL: combine_pshufhw1:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufhw1:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    retq
entry:
  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
  ret <8 x i16> %d
}
170
; Both AND operands are shuffled with the same single-source mask [0,2,1,3], so
; the combiner sinks the shuffle below the bitwise op: one pand plus one pshufd
; instead of two shuffles.
define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
188
; Same pattern as test1 but with OR: the shared [0,2,1,3] shuffle is sunk below
; the por, leaving one bitwise op and one shuffle.
define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
206
; Same pattern as test1 but with XOR: the shared [0,2,1,3] shuffle is sunk
; below the pxor, leaving one bitwise op and one shuffle.
define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
224
; As test1 but %a/%b arrive through the second shuffle operand (mask [4,6,5,7]
; reads only the second source); the shuffle still sinks below the AND.
define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
242
; As test4 but with OR: the second-operand-only shuffle [4,6,5,7] sinks below
; the por, leaving one bitwise op and one shuffle.
define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5:
; SSE:       # %bb.0:
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
260
; As test4 but with XOR: the second-operand-only shuffle [4,6,5,7] sinks below
; the pxor, leaving one bitwise op and one shuffle.
define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
278
279
280; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles
; are not performing swizzle operations.
282
; Mask [0,5,2,7] blends %a/%b with the same lanes of %c. ANDing a %c lane with
; itself is that lane, so the AND is done first and %c is blended in after
; (a blendps with SSE4.1/AVX, a pshufd+punpckldq sequence on plain SSE2/SSSE3).
define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
316
; As test1b but with OR: ORing a %c lane with itself is that lane, so the OR is
; done first and %c's lanes are blended in afterwards.
define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    orps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
350
; With XOR the shared %c lanes cancel to zero, so the blend becomes a mask:
; a constant andps on SSE2/SSSE3, or a blend with a zeroed register on
; SSE4.1/AVX.
define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3b:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
382
; As test1b but %a/%b come through the second shuffle operand, so the blend
; keeps %c's lanes 0 and 2 and takes lanes 1 and 3 from the AND result.
define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    andps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4b:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
416
; As test4b but with OR: the OR happens first, then %c's lanes 0 and 2 are
; blended back in.
define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    orps %xmm1, %xmm0
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5b:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
450
; As test3b but with %c in the first shuffle operand: the XORed %c lanes cancel
; to zero, so lanes 0 and 2 of the result are masked/blended to zero.
define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6b:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
482
; Mask [0,2,5,7] interleaves %a/%b with %c across the two halves; the AND is
; done first and a single two-input shufps combines the result with %c.
define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1c:
; SSE:       # %bb.0:
; SSE-NEXT:    andps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test1c:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
500
; As test1c but with OR: one orps followed by a single two-input shufps with %c.
define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2c:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
518
; With XOR the %c half cancels to zero, so the upper two lanes of the result
; are zeroed: shufps against a zeroed register on SSE2/SSSE3, a zeroing
; insertps on SSE4.1/AVX.
define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test3c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
550
; As test1c but with %c in the first shuffle operand: the AND happens first,
; then one shufps places %c's lanes in the low half and the AND result's lanes
; in the high half.
define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4c:
; SSE:       # %bb.0:
; SSE-NEXT:    andps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test4c:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %and = and <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %and
}
569
; As test4c but with OR: one orps, then a single two-input shufps with %c.
define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5c:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test5c:
; AVX:       # %bb.0:
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
588
; As test3c but with %c first: the XORed %c lanes cancel to zero, so the low
; two lanes of the result are zeroed (shufps with a zeroed register, or a
; zeroing insertps on SSE4.1/AVX).
define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm0
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm0
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_bitwise_ops_test6c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_bitwise_ops_test6c:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
  %xor = xor <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %xor
}
622
; The outer mask only reads lanes of the inner shuffle that came from %A (its
; index 4 yields undef), so the pair folds into one single-source pshufd.
define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}
637
; The only %B lane of the inner shuffle (index 5) is replaced by undef in the
; outer mask, so the pair folds into one single-source pshufd of %A.
define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}
652
; Same as test2 with a different %B lane (index 6) that the outer mask turns
; into undef; the result is the same single pshufd of %A.
define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}
667
; Only %A lanes 0 and 1 survive the outer mask, so the pair folds to a low-
; 64-bit duplicate: pshufd [0,1,0,1] on SSE/AVX1, vmovddup on AVX2.
define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3>
  ret <4 x i32> %2
}
687
; The outer mask keeps only %A lanes 2 and 3 (indices 4 are undef), folding the
; pair into a single pshufd that duplicates the high 64 bits.
define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test5:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
  ret <4 x i32> %2
}
702
; The %B lanes of the inner shuffle are dropped by the outer mask's undef
; indices, folding the pair into one pshufd of %A.
define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test6:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}
717
; The outer mask [0,2,0,2] reads only the %A lanes of the inner blend, so the
; pair folds into one pshufd of %A.
define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test7:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  ret <4 x i32> %2
}
732
; The outer mask reads only the %A lanes (inner indices 1 and 3); the %B lanes
; and undefs drop out, leaving one pshufd of %A.
define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test8:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i32> %2
}
747
; The single %B lane (inner index 5) is hidden behind the outer mask's undef
; index, so the pair folds into one pshufd of %A.
define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test9:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
  ret <4 x i32> %2
}
762
; After dropping the %B lanes via the outer undefs, every surviving lane is
; %A's element 1, so the pair folds into a single splat pshufd.
define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test10:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
  ret <4 x i32> %2
}
777
; The outer mask only reads %A lanes of the inner shuffle (indices 0, 1, and
; the undef 4), so the pair folds into one pshufd of %A.
define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test11:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test11:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
  ret <4 x i32> %2
}
792
; Every defined lane of the combined shuffle is %A's element 0, so the pair
; folds into a splat: pshufd [0,1,0,1] on SSE/AVX1, vbroadcastss on AVX2.
define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test12:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4>
  ret <4 x i32> %2
}
812
813; The following pair of shuffles is folded into vector %A.
814define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
815; CHECK-LABEL: combine_nested_undef_test13:
816; CHECK:       # %bb.0:
817; CHECK-NEXT:    retq
818  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
819  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
820  ret <4 x i32> %2
821}
822
823; The following pair of shuffles is folded into vector %B.
824define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
825; SSE-LABEL: combine_nested_undef_test14:
826; SSE:       # %bb.0:
827; SSE-NEXT:    movaps %xmm1, %xmm0
828; SSE-NEXT:    retq
829;
830; AVX-LABEL: combine_nested_undef_test14:
831; AVX:       # %bb.0:
832; AVX-NEXT:    vmovaps %xmm1, %xmm0
833; AVX-NEXT:    retq
834  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
835  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
836  ret <4 x i32> %2
837}
838
839
840; Verify that we don't optimize the following cases. We expect more than one shuffle.
841;
842; FIXME: Many of these already don't make sense, and the rest should stop
; making sense with the new vector shuffle lowering. Revisit at least testing for
844; it.
845
846define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
847; SSE2-LABEL: combine_nested_undef_test15:
848; SSE2:       # %bb.0:
849; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
850; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
851; SSE2-NEXT:    movaps %xmm1, %xmm0
852; SSE2-NEXT:    retq
853;
854; SSSE3-LABEL: combine_nested_undef_test15:
855; SSSE3:       # %bb.0:
856; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
857; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
858; SSSE3-NEXT:    movaps %xmm1, %xmm0
859; SSSE3-NEXT:    retq
860;
861; SSE41-LABEL: combine_nested_undef_test15:
862; SSE41:       # %bb.0:
863; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
864; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
865; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
866; SSE41-NEXT:    retq
867;
868; AVX1-LABEL: combine_nested_undef_test15:
869; AVX1:       # %bb.0:
870; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
871; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
872; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
873; AVX1-NEXT:    retq
874;
875; AVX2-LABEL: combine_nested_undef_test15:
876; AVX2:       # %bb.0:
877; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
878; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
879; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
880; AVX2-NEXT:    retq
881  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
882  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
883  ret <4 x i32> %2
884}
885
886define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
887; SSE2-LABEL: combine_nested_undef_test16:
888; SSE2:       # %bb.0:
889; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
890; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
891; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
892; SSE2-NEXT:    retq
893;
894; SSSE3-LABEL: combine_nested_undef_test16:
895; SSSE3:       # %bb.0:
896; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
897; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
898; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
899; SSSE3-NEXT:    retq
900;
901; SSE41-LABEL: combine_nested_undef_test16:
902; SSE41:       # %bb.0:
903; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
904; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
905; SSE41-NEXT:    retq
906;
907; AVX-LABEL: combine_nested_undef_test16:
908; AVX:       # %bb.0:
909; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
910; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
911; AVX-NEXT:    retq
912  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
913  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
914  ret <4 x i32> %2
915}
916
917define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
918; SSE2-LABEL: combine_nested_undef_test17:
919; SSE2:       # %bb.0:
920; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
921; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
922; SSE2-NEXT:    retq
923;
924; SSSE3-LABEL: combine_nested_undef_test17:
925; SSSE3:       # %bb.0:
926; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
927; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
928; SSSE3-NEXT:    retq
929;
930; SSE41-LABEL: combine_nested_undef_test17:
931; SSE41:       # %bb.0:
932; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
933; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
934; SSE41-NEXT:    retq
935;
936; AVX-LABEL: combine_nested_undef_test17:
937; AVX:       # %bb.0:
938; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
939; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
940; AVX-NEXT:    retq
941  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
942  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
943  ret <4 x i32> %2
944}
945
946define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
947; SSE-LABEL: combine_nested_undef_test18:
948; SSE:       # %bb.0:
949; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
950; SSE-NEXT:    retq
951;
952; AVX-LABEL: combine_nested_undef_test18:
953; AVX:       # %bb.0:
954; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3]
955; AVX-NEXT:    retq
956  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
957  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
958  ret <4 x i32> %2
959}
960
961define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
962; SSE2-LABEL: combine_nested_undef_test19:
963; SSE2:       # %bb.0:
964; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
965; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
966; SSE2-NEXT:    retq
967;
968; SSSE3-LABEL: combine_nested_undef_test19:
969; SSSE3:       # %bb.0:
970; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
971; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
972; SSSE3-NEXT:    retq
973;
974; SSE41-LABEL: combine_nested_undef_test19:
975; SSE41:       # %bb.0:
976; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
977; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
978; SSE41-NEXT:    retq
979;
980; AVX-LABEL: combine_nested_undef_test19:
981; AVX:       # %bb.0:
982; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
983; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
984; AVX-NEXT:    retq
985  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
986  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
987  ret <4 x i32> %2
988}
989
990define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
991; SSE2-LABEL: combine_nested_undef_test20:
992; SSE2:       # %bb.0:
993; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
994; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
995; SSE2-NEXT:    movaps %xmm1, %xmm0
996; SSE2-NEXT:    retq
997;
998; SSSE3-LABEL: combine_nested_undef_test20:
999; SSSE3:       # %bb.0:
1000; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
1001; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
1002; SSSE3-NEXT:    movaps %xmm1, %xmm0
1003; SSSE3-NEXT:    retq
1004;
1005; SSE41-LABEL: combine_nested_undef_test20:
1006; SSE41:       # %bb.0:
1007; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1008; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
1009; SSE41-NEXT:    retq
1010;
1011; AVX-LABEL: combine_nested_undef_test20:
1012; AVX:       # %bb.0:
1013; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1014; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
1015; AVX-NEXT:    retq
1016  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
1017  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
1018  ret <4 x i32> %2
1019}
1020
1021define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
1022; SSE2-LABEL: combine_nested_undef_test21:
1023; SSE2:       # %bb.0:
1024; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1025; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1026; SSE2-NEXT:    retq
1027;
1028; SSSE3-LABEL: combine_nested_undef_test21:
1029; SSSE3:       # %bb.0:
1030; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1031; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
1032; SSSE3-NEXT:    retq
1033;
1034; SSE41-LABEL: combine_nested_undef_test21:
1035; SSE41:       # %bb.0:
1036; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1037; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1038; SSE41-NEXT:    retq
1039;
1040; AVX1-LABEL: combine_nested_undef_test21:
1041; AVX1:       # %bb.0:
1042; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1043; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1044; AVX1-NEXT:    retq
1045;
1046; AVX2-LABEL: combine_nested_undef_test21:
1047; AVX2:       # %bb.0:
1048; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1049; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1050; AVX2-NEXT:    retq
1051  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
1052  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1053  ret <4 x i32> %2
1054}
1055
1056
1057; Test that we correctly combine shuffles according to rule
1058;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)
1059
1060define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
1061; SSE-LABEL: combine_nested_undef_test22:
1062; SSE:       # %bb.0:
1063; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
1064; SSE-NEXT:    retq
1065;
1066; AVX-LABEL: combine_nested_undef_test22:
1067; AVX:       # %bb.0:
1068; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3]
1069; AVX-NEXT:    retq
1070  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1071  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
1072  ret <4 x i32> %2
1073}
1074
1075define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
1076; SSE-LABEL: combine_nested_undef_test23:
1077; SSE:       # %bb.0:
1078; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
1079; SSE-NEXT:    retq
1080;
1081; AVX-LABEL: combine_nested_undef_test23:
1082; AVX:       # %bb.0:
1083; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3]
1084; AVX-NEXT:    retq
1085  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
1086  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
1087  ret <4 x i32> %2
1088}
1089
1090define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
1091; SSE-LABEL: combine_nested_undef_test24:
1092; SSE:       # %bb.0:
1093; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
1094; SSE-NEXT:    retq
1095;
1096; AVX-LABEL: combine_nested_undef_test24:
1097; AVX:       # %bb.0:
1098; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3]
1099; AVX-NEXT:    retq
1100  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1101  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
1102  ret <4 x i32> %2
1103}
1104
1105define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
1106; SSE-LABEL: combine_nested_undef_test25:
1107; SSE:       # %bb.0:
1108; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1109; SSE-NEXT:    retq
1110;
1111; AVX1-LABEL: combine_nested_undef_test25:
1112; AVX1:       # %bb.0:
1113; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1114; AVX1-NEXT:    retq
1115;
1116; AVX2-LABEL: combine_nested_undef_test25:
1117; AVX2:       # %bb.0:
1118; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1119; AVX2-NEXT:    retq
1120  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
1121  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
1122  ret <4 x i32> %2
1123}
1124
1125define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
1126; SSE-LABEL: combine_nested_undef_test26:
1127; SSE:       # %bb.0:
1128; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1129; SSE-NEXT:    retq
1130;
1131; AVX-LABEL: combine_nested_undef_test26:
1132; AVX:       # %bb.0:
1133; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
1134; AVX-NEXT:    retq
1135  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
1136  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
1137  ret <4 x i32> %2
1138}
1139
1140define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
1141; SSE-LABEL: combine_nested_undef_test27:
1142; SSE:       # %bb.0:
1143; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1144; SSE-NEXT:    retq
1145;
1146; AVX1-LABEL: combine_nested_undef_test27:
1147; AVX1:       # %bb.0:
1148; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
1149; AVX1-NEXT:    retq
1150;
1151; AVX2-LABEL: combine_nested_undef_test27:
1152; AVX2:       # %bb.0:
1153; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1154; AVX2-NEXT:    retq
1155  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
1156  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
1157  ret <4 x i32> %2
1158}
1159
1160define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
1161; SSE-LABEL: combine_nested_undef_test28:
1162; SSE:       # %bb.0:
1163; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
1164; SSE-NEXT:    retq
1165;
1166; AVX-LABEL: combine_nested_undef_test28:
1167; AVX:       # %bb.0:
1168; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
1169; AVX-NEXT:    retq
1170  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
1171  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
1172  ret <4 x i32> %2
1173}
1174
1175define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
1176; SSE-LABEL: combine_test1:
1177; SSE:       # %bb.0:
1178; SSE-NEXT:    movaps %xmm1, %xmm0
1179; SSE-NEXT:    retq
1180;
1181; AVX-LABEL: combine_test1:
1182; AVX:       # %bb.0:
1183; AVX-NEXT:    vmovaps %xmm1, %xmm0
1184; AVX-NEXT:    retq
1185  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1186  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1187  ret <4 x float> %2
1188}
1189
1190define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
1191; SSE2-LABEL: combine_test2:
1192; SSE2:       # %bb.0:
1193; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1194; SSE2-NEXT:    movaps %xmm1, %xmm0
1195; SSE2-NEXT:    retq
1196;
1197; SSSE3-LABEL: combine_test2:
1198; SSSE3:       # %bb.0:
1199; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1200; SSSE3-NEXT:    movaps %xmm1, %xmm0
1201; SSSE3-NEXT:    retq
1202;
1203; SSE41-LABEL: combine_test2:
1204; SSE41:       # %bb.0:
1205; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1206; SSE41-NEXT:    retq
1207;
1208; AVX-LABEL: combine_test2:
1209; AVX:       # %bb.0:
1210; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1211; AVX-NEXT:    retq
1212  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1213  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1214  ret <4 x float> %2
1215}
1216
1217define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
1218; SSE-LABEL: combine_test3:
1219; SSE:       # %bb.0:
1220; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1221; SSE-NEXT:    retq
1222;
1223; AVX-LABEL: combine_test3:
1224; AVX:       # %bb.0:
1225; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1226; AVX-NEXT:    retq
1227  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1228  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1229  ret <4 x float> %2
1230}
1231
1232define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
1233; SSE-LABEL: combine_test4:
1234; SSE:       # %bb.0:
1235; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1236; SSE-NEXT:    retq
1237;
1238; AVX-LABEL: combine_test4:
1239; AVX:       # %bb.0:
1240; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1241; AVX-NEXT:    retq
1242  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1243  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1244  ret <4 x float> %2
1245}
1246
1247define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
1248; SSE2-LABEL: combine_test5:
1249; SSE2:       # %bb.0:
1250; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1251; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1252; SSE2-NEXT:    retq
1253;
1254; SSSE3-LABEL: combine_test5:
1255; SSSE3:       # %bb.0:
1256; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1257; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1258; SSSE3-NEXT:    retq
1259;
1260; SSE41-LABEL: combine_test5:
1261; SSE41:       # %bb.0:
1262; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1263; SSE41-NEXT:    retq
1264;
1265; AVX-LABEL: combine_test5:
1266; AVX:       # %bb.0:
1267; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1268; AVX-NEXT:    retq
1269  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1270  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1271  ret <4 x float> %2
1272}
1273
1274define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
1275; SSE-LABEL: combine_test6:
1276; SSE:       # %bb.0:
1277; SSE-NEXT:    movaps %xmm1, %xmm0
1278; SSE-NEXT:    retq
1279;
1280; AVX-LABEL: combine_test6:
1281; AVX:       # %bb.0:
1282; AVX-NEXT:    vmovaps %xmm1, %xmm0
1283; AVX-NEXT:    retq
1284  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1285  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1286  ret <4 x i32> %2
1287}
1288
1289define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
1290; SSE2-LABEL: combine_test7:
1291; SSE2:       # %bb.0:
1292; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1293; SSE2-NEXT:    movaps %xmm1, %xmm0
1294; SSE2-NEXT:    retq
1295;
1296; SSSE3-LABEL: combine_test7:
1297; SSSE3:       # %bb.0:
1298; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1299; SSSE3-NEXT:    movaps %xmm1, %xmm0
1300; SSSE3-NEXT:    retq
1301;
1302; SSE41-LABEL: combine_test7:
1303; SSE41:       # %bb.0:
1304; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1305; SSE41-NEXT:    retq
1306;
1307; AVX-LABEL: combine_test7:
1308; AVX:       # %bb.0:
1309; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1310; AVX-NEXT:    retq
1311  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1312  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
1313  ret <4 x i32> %2
1314}
1315
1316define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
1317; SSE-LABEL: combine_test8:
1318; SSE:       # %bb.0:
1319; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1320; SSE-NEXT:    retq
1321;
1322; AVX-LABEL: combine_test8:
1323; AVX:       # %bb.0:
1324; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1325; AVX-NEXT:    retq
1326  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
1327  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
1328  ret <4 x i32> %2
1329}
1330
1331define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
1332; SSE-LABEL: combine_test9:
1333; SSE:       # %bb.0:
1334; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1335; SSE-NEXT:    movaps %xmm1, %xmm0
1336; SSE-NEXT:    retq
1337;
1338; AVX-LABEL: combine_test9:
1339; AVX:       # %bb.0:
1340; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
1341; AVX-NEXT:    retq
1342  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
1343  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1344  ret <4 x i32> %2
1345}
1346
1347define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
1348; SSE2-LABEL: combine_test10:
1349; SSE2:       # %bb.0:
1350; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1351; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1352; SSE2-NEXT:    retq
1353;
1354; SSSE3-LABEL: combine_test10:
1355; SSSE3:       # %bb.0:
1356; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1357; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1358; SSSE3-NEXT:    retq
1359;
1360; SSE41-LABEL: combine_test10:
1361; SSE41:       # %bb.0:
1362; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1363; SSE41-NEXT:    retq
1364;
1365; AVX-LABEL: combine_test10:
1366; AVX:       # %bb.0:
1367; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1368; AVX-NEXT:    retq
1369  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1370  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
1371  ret <4 x i32> %2
1372}
1373
1374define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
1375; CHECK-LABEL: combine_test11:
1376; CHECK:       # %bb.0:
1377; CHECK-NEXT:    retq
1378  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1379  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1380  ret <4 x float> %2
1381}
1382
1383define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
1384; SSE2-LABEL: combine_test12:
1385; SSE2:       # %bb.0:
1386; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1387; SSE2-NEXT:    movaps %xmm1, %xmm0
1388; SSE2-NEXT:    retq
1389;
1390; SSSE3-LABEL: combine_test12:
1391; SSSE3:       # %bb.0:
1392; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1393; SSSE3-NEXT:    movaps %xmm1, %xmm0
1394; SSSE3-NEXT:    retq
1395;
1396; SSE41-LABEL: combine_test12:
1397; SSE41:       # %bb.0:
1398; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1399; SSE41-NEXT:    retq
1400;
1401; AVX-LABEL: combine_test12:
1402; AVX:       # %bb.0:
1403; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1404; AVX-NEXT:    retq
1405  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1406  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1407  ret <4 x float> %2
1408}
1409
1410define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
1411; SSE-LABEL: combine_test13:
1412; SSE:       # %bb.0:
1413; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1414; SSE-NEXT:    retq
1415;
1416; AVX-LABEL: combine_test13:
1417; AVX:       # %bb.0:
1418; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1419; AVX-NEXT:    retq
1420  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1421  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1422  ret <4 x float> %2
1423}
1424
1425define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
1426; SSE-LABEL: combine_test14:
1427; SSE:       # %bb.0:
1428; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1429; SSE-NEXT:    retq
1430;
1431; AVX-LABEL: combine_test14:
1432; AVX:       # %bb.0:
1433; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1434; AVX-NEXT:    retq
1435  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1436  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1437  ret <4 x float> %2
1438}
1439
1440define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
1441; SSE2-LABEL: combine_test15:
1442; SSE2:       # %bb.0:
1443; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1444; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1445; SSE2-NEXT:    retq
1446;
1447; SSSE3-LABEL: combine_test15:
1448; SSSE3:       # %bb.0:
1449; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1450; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1451; SSSE3-NEXT:    retq
1452;
1453; SSE41-LABEL: combine_test15:
1454; SSE41:       # %bb.0:
1455; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1456; SSE41-NEXT:    retq
1457;
1458; AVX-LABEL: combine_test15:
1459; AVX:       # %bb.0:
1460; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1461; AVX-NEXT:    retq
1462  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1463  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1464  ret <4 x float> %2
1465}
1466
1467define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
1468; CHECK-LABEL: combine_test16:
1469; CHECK:       # %bb.0:
1470; CHECK-NEXT:    retq
1471  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1472  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
1473  ret <4 x i32> %2
1474}
1475
1476define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
1477; SSE2-LABEL: combine_test17:
1478; SSE2:       # %bb.0:
1479; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1480; SSE2-NEXT:    movaps %xmm1, %xmm0
1481; SSE2-NEXT:    retq
1482;
1483; SSSE3-LABEL: combine_test17:
1484; SSSE3:       # %bb.0:
1485; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1486; SSSE3-NEXT:    movaps %xmm1, %xmm0
1487; SSSE3-NEXT:    retq
1488;
1489; SSE41-LABEL: combine_test17:
1490; SSE41:       # %bb.0:
1491; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1492; SSE41-NEXT:    retq
1493;
1494; AVX-LABEL: combine_test17:
1495; AVX:       # %bb.0:
1496; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1497; AVX-NEXT:    retq
1498  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
1499  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1500  ret <4 x i32> %2
1501}
1502
1503define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
1504; SSE-LABEL: combine_test18:
1505; SSE:       # %bb.0:
1506; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1507; SSE-NEXT:    retq
1508;
1509; AVX-LABEL: combine_test18:
1510; AVX:       # %bb.0:
1511; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1512; AVX-NEXT:    retq
1513  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1514  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1515  ret <4 x i32> %2
1516}
1517
1518define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
1519; SSE-LABEL: combine_test19:
1520; SSE:       # %bb.0:
1521; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1522; SSE-NEXT:    retq
1523;
1524; AVX-LABEL: combine_test19:
1525; AVX:       # %bb.0:
1526; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1527; AVX-NEXT:    retq
1528  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
1529  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
1530  ret <4 x i32> %2
1531}
1532
1533define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
1534; SSE2-LABEL: combine_test20:
1535; SSE2:       # %bb.0:
1536; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1537; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1538; SSE2-NEXT:    retq
1539;
1540; SSSE3-LABEL: combine_test20:
1541; SSSE3:       # %bb.0:
1542; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
1543; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
1544; SSSE3-NEXT:    retq
1545;
1546; SSE41-LABEL: combine_test20:
1547; SSE41:       # %bb.0:
1548; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1549; SSE41-NEXT:    retq
1550;
1551; AVX-LABEL: combine_test20:
1552; AVX:       # %bb.0:
1553; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
1554; AVX-NEXT:    retq
1555  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
1556  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
1557  ret <4 x i32> %2
1558}
1559
1560define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
1561; SSE-LABEL: combine_test21:
1562; SSE:       # %bb.0:
1563; SSE-NEXT:    movaps %xmm0, %xmm2
1564; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1565; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1566; SSE-NEXT:    movaps %xmm2, (%rdi)
1567; SSE-NEXT:    retq
1568;
1569; AVX-LABEL: combine_test21:
1570; AVX:       # %bb.0:
1571; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
1572; AVX-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
1573; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1574; AVX-NEXT:    vmovaps %xmm2, (%rdi)
1575; AVX-NEXT:    vzeroupper
1576; AVX-NEXT:    retq
1577  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1578  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1579  store <4 x i32> %1, <4 x i32>* %ptr, align 16
1580  ret <4 x i32> %2
1581}
1582
; Concatenating two loaded <2 x float> values into an <8 x float> whose upper
; half is undef should fold to a movsd load plus a movhps load -- only XMM
; operations, despite the 256-bit result type.
define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; SSE-LABEL: combine_test22:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test22:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT:    retq
; Current AVX2 lowering of this is still awful, not adding a test case.
  %1 = load <2 x float>, <2 x float>* %a, align 8
  %2 = load <2 x float>, <2 x float>* %b, align 8
  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %3
}
1601
1602; PR22359
; Storing the two <2 x float> halves of the low 128 bits of %v to adjacent
; addresses should merge into one 16-byte unaligned store (movups).
define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
; SSE-LABEL: combine_test23:
; SSE:       # %bb.0:
; SSE-NEXT:    movups %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test23:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1
  %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
  %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
  store <2 x float> %shuffle0, <2 x float>* %ptr, align 8
  store <2 x float> %shuffle1, <2 x float>* %idx2, align 8
  ret void
}
1621
1622; Check some negative cases.
1623; FIXME: Do any of these really make sense? Are they redundant with the above tests?
1624
; Both shuffles together select only elements of %b (result b[0,1,2,0]), so
; the pair folds to a single shuffle of %xmm1.
define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1b:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
  ret <4 x float> %2
}

; The combined result is b[0,1,0,1] -- a splat of the low 64 bits of %b, so
; this folds to movddup (movlhps on SSE2, which lacks movddup).
define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}

; The combined result mixes both inputs (a0,b3,b2,b3), so the lowering still
; needs two shuffle instructions on every subtarget.
define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test3b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3b:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
  ret <4 x float> %2
}

; The combined result is b[1,1,2,3] -- only %b is live, so the pair folds to
; a single shuffle of %xmm1.
define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4b:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}
1711
1712
1713; Verify that we correctly fold shuffles even when we use illegal vector types.
1714
; Combined result takes element 0 from %A and elements 1-3 from %B, i.e. a
; per-byte blend: and/andn/or on SSE2, pshufb on SSSE3, pblendvb on SSE4.1+.
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    andps %xmm0, %xmm2
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test1c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test1c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i8> %2
}

; Combined result is <A0,A1,B0,B1>, which is exactly a 16-bit interleave of
; the two loaded dwords -- a single punpcklwd.
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE-LABEL: combine_test2c:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  ret <4 x i8> %2
}

; Combined result is <B2,B3,A2,A3>: a punpcklwd interleave followed by a
; pshufd to pick out the second dword.
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE-LABEL: combine_test3c:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test3c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i8> %2
}

; Combined result takes element 1 from %A and the rest from %B -- the mirror
; of combine_test1c, with the blend mask inverted in lane 0/1.
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    andps %xmm0, %xmm2
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test4c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test4c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <4 x i8>, <4 x i8>* %a
  %B = load <4 x i8>, <4 x i8>* %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i8> %2
}
1842
1843
1844; The following test cases are generated from this C++ code
1845;
1846;__m128 blend_01(__m128 a, __m128 b)
1847;{
1848;  __m128 s = a;
1849;  s = _mm_blend_ps( s, b, 1<<0 );
1850;  s = _mm_blend_ps( s, b, 1<<1 );
1851;  return s;
1852;}
1853;
1854;__m128 blend_02(__m128 a, __m128 b)
1855;{
1856;  __m128 s = a;
1857;  s = _mm_blend_ps( s, b, 1<<0 );
1858;  s = _mm_blend_ps( s, b, 1<<2 );
1859;  return s;
1860;}
1861;
1862;__m128 blend_123(__m128 a, __m128 b)
1863;{
1864;  __m128 s = a;
1865;  s = _mm_blend_ps( s, b, 1<<1 );
1866;  s = _mm_blend_ps( s, b, 1<<2 );
1867;  s = _mm_blend_ps( s, b, 1<<3 );
1868;  return s;
1869;}
1870
1871; Ideally, we should collapse the following shuffles into a single one.
1872
; Blend of lanes 0 and 1: the chained shuffles fold to b[0,1],a[2,3], i.e. a
; single movsd (pre-SSE4.1) or blendps.
define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_01:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_01:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_01:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_01:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle6
}

; Blend of lanes 0 and 2 (non-contiguous): a single blendps on SSE4.1+/AVX;
; pre-SSE4.1 has no lane blend, so the lowering uses two shufps.
define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_02:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_02:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_02:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_02:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %shuffle6
}

; Blend of lanes 1-3: three chained blends fold to a[0],b[1,2,3], i.e. a
; single movss (pre-SSE4.1) or blendps.
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_123:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_123:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_123:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %shuffle12
}
1954
; Each of the three movhl tests below composes two shuffles whose net result
; is <b2,b3,a2,a3>, so all of them should fold to a single unpckhpd %b,%a.
define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_1:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; Same net mask as combine_test_movhl_1 reached via different intermediates.
define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_2:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
  ret <4 x i32> %2
}

; Same net mask as combine_test_movhl_1 reached via different intermediates.
define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_3:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
  ret <4 x i32> %2
}
2002
2003
2004; Verify that we fold shuffles according to rule:
2005;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)
2006
; Net result b[0,1],a[2,3]: the inner single-input shuffle is merged into the
; outer shuffle, leaving a lone movsd/blendps.
define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}

; Net result a[0,1],b[0,1]: folds to a single movlhps.
define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}

; Same net mask as combine_undef_input_test2 via different inner mask.
define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

; Net result b[2,3],a[2,3]: folds to a single movhlps/unpckhpd.
define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

; Net result a[0,1],b[2,3]: folds to a single shufps/blendps.
define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test5:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}
2101
2102
2103; Verify that we fold shuffles according to rule:
2104;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2105
; Both shuffle operands are %a, and the net mask is the identity, so the whole
; pair disappears and the function is just ret.
define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}

; Net result a[0,1,0,1]: a low-64-bit splat of %a, i.e. movddup (movlhps on
; SSE2, which lacks movddup).
define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test7:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}

; Same net low-64-bit splat as combine_undef_input_test7, via different masks.
define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

; Net result a[2,3,2,3]: a high-64-bit splat, i.e. movhlps/permilpd.
define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test9:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

; Net mask is the identity on %a, so both shuffles are eliminated entirely.
define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test10:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}
2188
; Tests 11-15 mirror tests 1-5 with the inner shuffle as the SECOND operand of
; the outer shuffle; the expected lowerings are identical single instructions.
define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test11:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test11:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test11:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test11:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
  ret <4 x float> %2
}

; Mirror of combine_undef_input_test2: folds to movlhps a,b.
define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test12:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}

; Mirror of combine_undef_input_test3: folds to movlhps a,b.
define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test13:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test13:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}

; Mirror of combine_undef_input_test4: folds to movhlps/unpckhpd b,a.
define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test14:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}

; Mirror of combine_undef_input_test5: folds to a single shufps/blendps.
define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test15:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2283
2284
2285; Verify that shuffles are canonicalized according to rules:
2286;  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
2287;
2288; This allows to trigger the following combine rule:
2289;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
2290;
2291; As a result, all the shuffle pairs in each function below should be
2292; combined into a single legal shuffle operation.
2293
; After canonicalization the net mask is the identity on %a, so both shuffles
; are eliminated and the function is just ret.
define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %2
}

; Canonicalized pair collapses to a low-64-bit splat of %a: movddup
; (movlhps on SSE2, which lacks movddup).
define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}

; Same low-64-bit splat as combine_undef_input_test17, via different masks.
define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test18:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test18:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test18:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  ret <4 x float> %2
}

; Canonicalized pair collapses to a high-64-bit splat: movhlps/permilpd.
define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test19:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}

; Net mask is the identity on %a, so both shuffles vanish.
define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test20:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}
2376
2377; These tests are designed to test the ability to combine away unnecessary
2378; operations feeding into a shuffle. The AVX cases are the important ones as
2379; they leverage operations which cannot be done naturally on the entire vector
2380; and thus are decomposed into multiple smaller operations.
2381
; The shuffle mask <7,6,5,4,7,6,5,4> only reads the high 128-bit half of the
; add, so SSE applies paddd only to %xmm1 and AVX1 extracts and adds only the
; upper subvector; the AVX2 variants keep a full-width ymm add.
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT:    retq
  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i32> %c
}
2423
; Two-input variant: the mask reads b[7..4] and c[15..12], i.e. only the upper
; halves of %b and of the add result, so only the upper-half add is needed.
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    retq
  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
  ret <8 x i32> %d
}
2451
; %c = <a0,b2,a2,b0>, %d = <b2,a1,a2,a3>: only b[2] survives into lane 0, so
; on SSE4.1+/AVX the pair must combine to a single INSERTPS.
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x float> %d
}
2481
; %c = <a0,a1,b2,b3>, %d = <a0,b2,a2,a3>: only b[2] survives into lane 1, so
; the pair must combine to a single INSERTPS on SSE4.1+/AVX.
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps2:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %d
}
2511
; %c = <a0,b0,a2,b1>, %d = <a0,a1,b0,a3>: only b[0] survives into lane 2, so
; the pair must combine to a single INSERTPS on SSE4.1+/AVX.
define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps3:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
  ret <4 x float> %d
}
2539
; %c = <a0,b0,a2,b1>, %d = <a0,a1,a2,b0>: only b[0] survives into lane 3, so
; the pair must combine to a single INSERTPS on SSE4.1+/AVX.
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps4:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
  ret <4 x float> %d
}
2567
; A scalar f64 load, zeroing of the upper element, and a blend-with-zero shuffle
; should all fold into one MOVSD (which zero-fills the upper lanes) plus a store.
define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movaps %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
  %1 = load double, double* %a0, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
  %4 = bitcast <2 x double> %3 to <4 x float>
  %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  store <4 x float> %5, <4 x float>* %a1, align 16
  ret void
}
2588
2589; PR30371
; Inserting a scalar into lane 0 of an otherwise-constant vector should become
; a single blend/movss against the constant pool, not an insert + shuffle.
define <4 x float> @combine_constant_insertion_v4f32(float %f) {
; SSE2-LABEL: combine_constant_insertion_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0>
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_constant_insertion_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; AVX-NEXT:    retq
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %ret
}
2618
; Integer variant of the above (PR30371): scalar into lane 0 of a constant
; vector should lower to movss/pinsrd against the constant, with no shuffle.
define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
; SSE2-LABEL: combine_constant_insertion_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd %edi, %xmm1
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd %edi, %xmm1
; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <u,4,5,30>
; SSE41-NEXT:    pinsrd $0, %edi, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_constant_insertion_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = <u,4,5,30>
; AVX-NEXT:    vpinsrd $0, %edi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
  %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i32> %ret
}
2649
; PR22377: odd/even lane extractions feeding an fadd whose result is
; interleaved back with the even lanes; SSSE3+ should recognize the
; horizontal-add pattern (haddps). %b is intentionally unused.
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22377:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22377:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movaps %xmm0, %xmm1
; SSSE3-NEXT:    haddps %xmm0, %xmm1
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22377:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movaps %xmm0, %xmm1
; SSE41-NEXT:    haddps %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22377:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  %r2 = fadd <4 x float> %s1, %s2
  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %s3
}
2689
; PR22390: a rotate-style shuffle of %a, then lane 0 replaced by b[0], then an
; fadd of the two — the rotate should be emitted once and shared, with the
; second input formed by a blend/movss rather than a duplicated shuffle.
define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    addps %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22390:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSSE3-NEXT:    addps %xmm0, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22390:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22390:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %r2 = fadd <4 x float> %s1, %s2
  ret <4 x float> %r2
}
2728
; PR22412: a cross-input 256-bit shuffle followed by a cross-lane reversal;
; the combine should produce a blend + lane swap + in-lane shufps rather than
; scalar extraction.
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: PR22412:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR22412:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22412:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX2-NEXT:    retq
entry:
  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
  ret <8 x float> %s2
}
2755
; PR30264: chained shuffles against two constant vectors, leaving
; <x0, 0.0, 4.0, 1.0>; SSE4.1+/AVX should fold to a single INSERTPS that
; inserts x[0] and zeroes lane 1 over the constant.
define <4 x float> @PR30264(<4 x float> %x) {
; SSE2-LABEL: PR30264:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR30264:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR30264:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR30264:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0>
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
; AVX-NEXT:    retq
  %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x float> %shuf2
}
2789
; PR39549: shuffle of the high 8 bytes into even byte lanes plus shl/ashr by 8
; is a sign-extension of bytes 8-15; it should lower to punpckhbw + psraw.
define <8 x i16> @PR39549(<16 x i8> %x) {
; SSE-LABEL: PR39549:
; SSE:       # %bb.0:
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR39549:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT:    vpsraw $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef>
  %b = bitcast <16 x i8> %a to <8 x i16>
  %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %d
}
2808
; PR41545: deinterleaving %a1 into its 4 byte streams, zero-extending,
; shifting, and OR-ing them back together reassembles the original i32 lanes,
; so the whole thing should fold to a single paddd of %a0 and %a1.
define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) {
; SSE-LABEL: PR41545:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR41545:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %2  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %3  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %4  = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %5  = zext <4 x i8> %1 to <4 x i32>
  %6  = zext <4 x i8> %2 to <4 x i32>
  %7  = zext <4 x i8> %3 to <4 x i32>
  %8  = zext <4 x i8> %4 to <4 x i32>
  %9  = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8>
  %10 = shl <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16>
  %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24>
  %12 = or <4 x i32> %5, %9
  %13 = or <4 x i32> %12, %10
  %14 = or <4 x i32> %13, %11
  %15 = add <4 x i32> %a0, %14
  ret <4 x i32> %15
}
2836
; An extractelement/insertelement chain that permutes %a to lanes
; <2,1,0,3,6,5,4,7> should be recognized as a shuffle and lowered to
; pshuflw+pshufhw (or one pshufb with fast variable per-lane shuffles).
define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) {
; SSE-LABEL: shuffle_extract_insert:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_extract_insert:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_extract_insert:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_extract_insert:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15]
; AVX2-FAST-NEXT:    retq
  %a0 = extractelement <8 x i16> %a, i32 0
  %a1 = extractelement <8 x i16> %a, i32 1
  %a3 = extractelement <8 x i16> %a, i32 3
  %a4 = extractelement <8 x i16> %a, i32 4
  %a5 = extractelement <8 x i16> %a, i32 5
  %a6 = extractelement <8 x i16> %a, i32 6
  %a7 = extractelement <8 x i16> %a, i32 7
  %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = insertelement <8 x i16> %1, i16 %a1, i32 1
  %3 = insertelement <8 x i16> %2, i16 %a0, i32 2
  %4 = insertelement <8 x i16> %3, i16 %a3, i32 3
  %5 = insertelement <8 x i16> %4, i16 %a6, i32 4
  %6 = insertelement <8 x i16> %5, i16 %a5, i32 5
  %7 = insertelement <8 x i16> %6, i16 %a4, i32 6
  %8 = insertelement <8 x i16> %7, i16 %a7, i32 7
  ret <8 x i16> %8
}
2877
; Two-source variant: the extract/insert chain builds <a2,b0,a0,b3,a6,b5,a4,b7>,
; which should become per-source shuffles followed by an interleave
; (pshufb/pshuflw chains + punpcklwd), not scalar moves.
define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_extract_insert_double:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_extract_insert_double:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_extract_insert_double:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_extract_insert_double:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
  %a0 = extractelement <8 x i16> %a, i32 0
  %a4 = extractelement <8 x i16> %a, i32 4
  %a6 = extractelement <8 x i16> %a, i32 6
  %b11 = extractelement <8 x i16> %b, i32 3
  %b13 = extractelement <8 x i16> %b, i32 5
  %b15 = extractelement <8 x i16> %b, i32 7
  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
  %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
  %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
  %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
  %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
  %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
  ret <8 x i16> %7
}
2927
; Same extract/insert pattern as above, but the first source is itself a
; concat of two v4i16 halves; the combine should see through the concat
; (punpcklqdq) and still produce the shuffle + interleave lowering.
define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) {
; SSE2-LABEL: shuffle_extract_concat_insert:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_extract_concat_insert:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_extract_concat_insert:
; SSE41:       # %bb.0:
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE41-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_extract_concat_insert:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
  %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %a0 = extractelement <8 x i16> %a, i32 0
  %a4 = extractelement <8 x i16> %a, i32 4
  %a6 = extractelement <8 x i16> %a, i32 6
  %b11 = extractelement <8 x i16> %b, i32 3
  %b13 = extractelement <8 x i16> %b, i32 5
  %b15 = extractelement <8 x i16> %b, i32 7
  %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = insertelement <8 x i16> %1, i16 %a0, i32 2
  %3 = insertelement <8 x i16> %2, i16 %b11, i32 3
  %4 = insertelement <8 x i16> %3, i16 %a6, i32 4
  %5 = insertelement <8 x i16> %4, i16 %b13, i32 5
  %6 = insertelement <8 x i16> %5, i16 %a4, i32 6
  %7 = insertelement <8 x i16> %6, i16 %b15, i32 7
  ret <8 x i16> %7
}
2982
; Mixes sign-extended vector-load elements, sign-extended scalar loads, a -5
; constant, and undef lanes through an insert chain plus a final shuffle;
; checks the combine keeps the extracted lanes correct across the sext and
; avoids redundant scalar round-trips where pinsrw/palignr suffice.
define <8 x i16> @shuffle_scalar_to_vector_extract(<8 x i8>* %p0, i8* %p1, i8* %p2) {
; SSE2-LABEL: shuffle_scalar_to_vector_extract:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    psraw $8, %xmm1
; SSE2-NEXT:    pextrw $7, %xmm1, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    movsbl (%rsi), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    movsbl (%rdx), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: shuffle_scalar_to_vector_extract:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT:    psraw $8, %xmm1
; SSSE3-NEXT:    movsbl (%rsi), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSSE3-NEXT:    movsbl (%rdx), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_scalar_to_vector_extract:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw (%rdi), %xmm0
; SSE41-NEXT:    pextrw $4, %xmm0, %eax
; SSE41-NEXT:    pextrw $7, %xmm0, %ecx
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pinsrw $1, %eax, %xmm0
; SSE41-NEXT:    movl $65531, %eax # imm = 0xFFFB
; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
; SSE41-NEXT:    movsbl (%rsi), %eax
; SSE41-NEXT:    pinsrw $5, %eax, %xmm0
; SSE41-NEXT:    movsbl (%rdx), %eax
; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_scalar_to_vector_extract:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbw (%rdi), %xmm0
; AVX-NEXT:    vpextrw $4, %xmm0, %eax
; AVX-NEXT:    vpextrw $7, %xmm0, %ecx
; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; AVX-NEXT:    movl $65531, %eax # imm = 0xFFFB
; AVX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
; AVX-NEXT:    movsbl (%rsi), %eax
; AVX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; AVX-NEXT:    movsbl (%rdx), %eax
; AVX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
  %tmp = load <8 x i8>, <8 x i8>* %p0, align 1
  %tmp1 = sext <8 x i8> %tmp to <8 x i16>
  %tmp2 = load i8, i8* %p1, align 1
  %cvt1 = sext i8 %tmp2 to i16
  %tmp3 = load i8, i8* %p2, align 1
  %cvt2 = sext i8 %tmp3 to i16
  %tmp4 = extractelement <8 x i16> %tmp1, i32 4
  %tmp5 = extractelement <8 x i16> %tmp1, i32 7
  %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0
  %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1
  %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3
  %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4
  %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5
  %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6
  %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7
  %tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %tmp13
}
3069
3070; Bug noticed in D96345
; Regression test for the bug noticed in D96345: combining a shuffle of two
; adds where one add's operands come from shuffles with undef/poison inputs
; must not miscompile; the result should reduce to a single paddw + psrlw.
define i32 @shuffle_binops_with_undef() {
; SSE-LABEL: shuffle_binops_with_undef:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    movdqa (%rax), %xmm0
; SSE-NEXT:    paddw %xmm0, %xmm0
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    psrlw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_binops_with_undef:
; AVX:       # %bb.0: # %entry
; AVX-NEXT:    vmovdqa (%rax), %xmm0
; AVX-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rax)
; AVX-NEXT:    retq
entry:
  %load0 = load <8 x i16>, <8 x i16>* undef, align 16
  %load1 = load <8 x i16>, <8 x i16>* undef, align 16
  %shuf0 = shufflevector <16 x i8> undef, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %addi = add <8 x i16> %load0, %load1
  %bc0 = bitcast <8 x i16> %addi to <2 x i64>
  %bc1 = bitcast <16 x i8> %shuf0 to <8 x i16>
  %shuf1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %addi24 = add <8 x i16> %shuf1, %bc1
  %bc2 = bitcast <8 x i16> %addi24 to <2 x i64>
  %shuf2 = shufflevector <2 x i64> %bc0, <2 x i64> %bc2, <2 x i32> <i32 0, i32 2>
  %bc3 = bitcast <2 x i64> %shuf2 to <8 x i16>
  %psrli = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %bc3, i32 ptrtoint (i32 ()* @shuffle_binops_with_undef to i32))
  store <8 x i16> %psrli, <8 x i16>* undef, align 16
  ret i32 undef
}
3105declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
3106
define void @PR43024() {
; SSE-LABEL: PR43024:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE-NEXT:    movaps %xmm0, (%rax)
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    movss %xmm0, (%rax)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR43024:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; AVX-NEXT:    vmovaps %xmm0, (%rax)
; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovss %xmm0, (%rax)
; AVX-NEXT:    retq
  ; Store <NaN, NaN, 0, 0> and reload it; the fmul by zero yields NaN in
  ; lanes 0-1 (NaN * 0.0 == NaN), so without fast-math flags the combiner
  ; must NOT fold away the multiply or the zero adds below.
  store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16
  %1 = load <4 x float>, <4 x float>* undef, align 16
  %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0>
  ; Lane 0 of the final result is ((%2[0] + %2[1]) + 0.0) + %2[3]; the
  ; shuffles pull lanes 1 and 3 of the product down into lane 0.
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %4 = fadd <4 x float> %2, %3
  %5 = fadd <4 x float> zeroinitializer, %4
  %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %7 = fadd <4 x float> %6, %5
  %8 = extractelement <4 x float> %7, i32 0
  store float %8, float* undef, align 8
  ret void
}
3141
define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
; SSE2-LABEL: PR45604:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rsi), %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    movzwl %ax, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movl $11, %eax
; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
; SSE2-NEXT:    pinsrw $4, %ecx, %xmm0
; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
; SSE2-NEXT:    pextrw $2, %xmm1, %ecx
; SSE2-NEXT:    movd %ecx, %xmm2
; SSE2-NEXT:    pinsrw $2, %eax, %xmm2
; SSE2-NEXT:    pextrw $3, %xmm1, %ecx
; SSE2-NEXT:    pinsrw $4, %ecx, %xmm2
; SSE2-NEXT:    pinsrw $6, %eax, %xmm2
; SSE2-NEXT:    pextrw $4, %xmm1, %ecx
; SSE2-NEXT:    movd %ecx, %xmm3
; SSE2-NEXT:    pinsrw $2, %eax, %xmm3
; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
; SSE2-NEXT:    pinsrw $4, %ecx, %xmm3
; SSE2-NEXT:    pinsrw $6, %eax, %xmm3
; SSE2-NEXT:    pextrw $6, %xmm1, %ecx
; SSE2-NEXT:    movd %ecx, %xmm4
; SSE2-NEXT:    pinsrw $2, %eax, %xmm4
; SSE2-NEXT:    pextrw $7, %xmm1, %ecx
; SSE2-NEXT:    pinsrw $4, %ecx, %xmm4
; SSE2-NEXT:    pinsrw $6, %eax, %xmm4
; SSE2-NEXT:    movdqa %xmm4, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm3, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR45604:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa (%rsi), %xmm1
; SSSE3-NEXT:    movd %xmm1, %eax
; SSSE3-NEXT:    movzwl %ax, %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    movl $11, %eax
; SSSE3-NEXT:    pinsrw $2, %eax, %xmm0
; SSSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm0
; SSSE3-NEXT:    pinsrw $6, %eax, %xmm0
; SSSE3-NEXT:    pextrw $2, %xmm1, %ecx
; SSSE3-NEXT:    movd %ecx, %xmm2
; SSSE3-NEXT:    pinsrw $2, %eax, %xmm2
; SSSE3-NEXT:    pextrw $3, %xmm1, %ecx
; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm2
; SSSE3-NEXT:    pinsrw $6, %eax, %xmm2
; SSSE3-NEXT:    pextrw $4, %xmm1, %ecx
; SSSE3-NEXT:    movd %ecx, %xmm3
; SSSE3-NEXT:    pinsrw $2, %eax, %xmm3
; SSSE3-NEXT:    pextrw $5, %xmm1, %ecx
; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm3
; SSSE3-NEXT:    pinsrw $6, %eax, %xmm3
; SSSE3-NEXT:    pextrw $6, %xmm1, %ecx
; SSSE3-NEXT:    movd %ecx, %xmm4
; SSSE3-NEXT:    pinsrw $2, %eax, %xmm4
; SSSE3-NEXT:    pextrw $7, %xmm1, %ecx
; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm4
; SSSE3-NEXT:    pinsrw $6, %eax, %xmm4
; SSSE3-NEXT:    movdqa %xmm4, 48(%rdi)
; SSSE3-NEXT:    movdqa %xmm3, 32(%rdi)
; SSSE3-NEXT:    movdqa %xmm2, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR45604:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa (%rsi), %xmm1
; SSE41-NEXT:    pextrw $2, %xmm1, %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    movl $11, %eax
; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
; SSE41-NEXT:    pextrw $3, %xmm1, %ecx
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
; SSE41-NEXT:    pextrw $4, %xmm1, %ecx
; SSE41-NEXT:    movd %ecx, %xmm2
; SSE41-NEXT:    pinsrw $2, %eax, %xmm2
; SSE41-NEXT:    pextrw $5, %xmm1, %ecx
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm2
; SSE41-NEXT:    pinsrw $6, %eax, %xmm2
; SSE41-NEXT:    pextrw $6, %xmm1, %ecx
; SSE41-NEXT:    movd %ecx, %xmm3
; SSE41-NEXT:    pinsrw $2, %eax, %xmm3
; SSE41-NEXT:    pextrw $7, %xmm1, %ecx
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm3
; SSE41-NEXT:    pinsrw $6, %eax, %xmm3
; SSE41-NEXT:    pxor %xmm4, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
; SSE41-NEXT:    pinsrw $2, %eax, %xmm4
; SSE41-NEXT:    pextrw $1, %xmm1, %ecx
; SSE41-NEXT:    pinsrw $4, %ecx, %xmm4
; SSE41-NEXT:    pinsrw $6, %eax, %xmm4
; SSE41-NEXT:    movdqa %xmm4, (%rdi)
; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR45604:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rsi), %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR45604:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rsi), %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0>
; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm1, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %v1 = load <8 x i16>, <8 x i16>* %src, align 16
  ; %v2 = %v1 widened to 16 lanes with a zero upper half (lanes 8-15 come
  ; from the zeroinitializer operand).
  %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; 4-way interleave: for each i in 0..7 the mask picks %v2[i], %v2[i+8],
  ; const[i], const[i+8], i.e. every group of four output lanes is
  ; { %v1[i], 0, 11, 0 }. The combiner should lower this without fully
  ; scalarizing the interleave.
  %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
  store <32 x i16> %v3, <32 x i16>* %dst, align 16
  ret void
}
3290
3291; Test case reported on D105827
define void @SpinningCube() {
; SSE2-LABEL: SpinningCube:
; SSE2:       # %bb.0: # %entry
; SSE2-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movaps %xmm2, %xmm3
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[1,3]
; SSE2-NEXT:    xorps %xmm4, %xmm4
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
; SSE2-NEXT:    addps %xmm4, %xmm2
; SSE2-NEXT:    movaps %xmm2, (%rax)
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; SSE2-NEXT:    mulps %xmm2, %xmm1
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, (%rax)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: SpinningCube:
; SSSE3:       # %bb.0: # %entry
; SSSE3-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movaps %xmm2, %xmm3
; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[1,3]
; SSSE3-NEXT:    xorps %xmm4, %xmm4
; SSSE3-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
; SSSE3-NEXT:    addps %xmm4, %xmm2
; SSSE3-NEXT:    movaps %xmm2, (%rax)
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
; SSSE3-NEXT:    mulps %xmm1, %xmm2
; SSSE3-NEXT:    addps %xmm0, %xmm2
; SSSE3-NEXT:    movaps %xmm2, (%rax)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: SpinningCube:
; SSE41:       # %bb.0: # %entry
; SSE41-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; SSE41-NEXT:    movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    movaps %xmm1, %xmm3
; SSE41-NEXT:    insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
; SSE41-NEXT:    movaps %xmm0, %xmm4
; SSE41-NEXT:    insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
; SSE41-NEXT:    addps %xmm3, %xmm4
; SSE41-NEXT:    movaps %xmm4, (%rax)
; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
; SSE41-NEXT:    mulps %xmm1, %xmm2
; SSE41-NEXT:    addps %xmm0, %xmm2
; SSE41-NEXT:    movaps %xmm2, (%rax)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: SpinningCube:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,3]
; AVX1-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3]
; AVX1-NEXT:    vaddps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovaps %xmm2, (%rax)
; AVX1-NEXT:    vbroadcastss (%rax), %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rax)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: SpinningCube:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    movl $1065353216, (%rax) # imm = 0x3F800000
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT:    vmovaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,3]
; AVX2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX2-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; AVX2-NEXT:    vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3]
; AVX2-NEXT:    vaddps %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovaps %xmm2, (%rax)
; AVX2-NEXT:    vbroadcastss (%rax), %xmm2
; AVX2-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rax)
; AVX2-NEXT:    retq
entry:
  store float 1.000000e+00, float* undef, align 4
  %0 = load float, float* undef, align 4
  %1 = fmul float undef, 0.000000e+00
  ; %2 holds the loaded scalar in lane 3 only; the other lanes of the base
  ; vector start as poison.
  %2 = insertelement <4 x float> poison, float %0, i32 3
  %3 = load float, float* undef, align 4
  ; Splat a loaded scalar across a <2 x float> and multiply by <0, -2>.
  %4 = insertelement <2 x float> poison, float %3, i32 0
  %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer
  %6 = fmul <2 x float> %5, <float 0.000000e+00, float -2.000000e+00>
  %7 = fadd float %1, undef
  ; Widen the 2-lane product into lanes 1-2 of a 4-lane vector (lane 0 from
  ; the undef first operand) and complete it with the scalar sum in lane 3.
  %8 = shufflevector <2 x float> %6, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %9 = shufflevector <4 x float> undef, <4 x float> %8, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
  %10 = insertelement <4 x float> %9, float %7, i32 3
  ; Build the second addend on top of %2: NaN in lane 1, undef in lanes 0
  ; and 2, the loaded value still in lane 3.
  %11 = insertelement <4 x float> %2, float 0x7FF8000000000000, i32 1
  %12 = insertelement <4 x float> %11, float undef, i32 0
  %13 = insertelement <4 x float> %12, float undef, i32 2
  %14 = fadd <4 x float> %10, %13
  store <4 x float> %14, <4 x float>* undef, align 16
  ; Second round: same splat-multiply-widen pattern, added to %2 directly.
  %15 = load float, float* undef, align 4
  %16 = insertelement <2 x float> poison, float %15, i32 0
  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> zeroinitializer
  %18 = fmul <2 x float> %17, <float 0.000000e+00, float -2.000000e+00>
  %19 = shufflevector <2 x float> %18, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %20 = shufflevector <4 x float> undef, <4 x float> %19, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
  %21 = fadd <4 x float> %20, %2
  store <4 x float> %21, <4 x float>* undef, align 16
  ret void
}
3418