; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-PERLANE

; fold (shl 0, x) -> 0
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (shl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_shl_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %2 = shl <4 x i32> %1, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_outofrange3(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (shl x, 0) -> x
define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (shl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_shl_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_known_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_known_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65536,32768,16384,8192]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_known_zero1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 8589803520, i32 17179607040, i32 34359214080>
  %2 = shl <4 x i32> %1, <i32 16, i32 15, i32 14, i32 13>
  ret <4 x i32> %2
}

; fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_shl_trunc_and:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_trunc_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST-ALL:       # %bb.0:
; AVX-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
; AVX-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-FAST-ALL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-FAST-ALL-NEXT:    vzeroupper
; AVX-FAST-ALL-NEXT:    retq
;
; AVX-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST-PERLANE:       # %bb.0:
; AVX-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-FAST-PERLANE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-FAST-PERLANE-NEXT:    vzeroupper
; AVX-FAST-PERLANE-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = shl <4 x i32> %x, %2
  ret <4 x i32> %3
}
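; The i64 masks <15, 255, 4095, 65535> all fit in 32 bits, so the AND can be applied to the truncated <4 x i32> shift amounts.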

; fold (shl (shl x, c1), c2) -> (shl x, (add c1, c2))
define <4 x i32> @combine_vec_shl_shl0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = shl <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}
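; c1 + c2 = 2 + 4 = 6, so both shifts combine into the single shift by 6 above.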

define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_shl1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,64,256,1024]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_shl1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = shl <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}
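; Per-lane the amounts add to <4, 6, 8, 10>, i.e. a multiply by <16, 64, 256, 1024> when variable shifts are unavailable.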

; fold (shl (shl x, c1), c2) -> 0 if (c1 + c2) >= size(x)
define <4 x i32> @combine_vec_shl_shlr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shlr_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shlr_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = shl <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}
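; 16 + 20 = 36 >= 32, so every lane is known zero.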

define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = shl <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $20, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $20, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $20, %xmm1
; SSE41-NEXT:    pslld $20, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpslld $20, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %3
}
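; 4 + 16 = 20, so one shift by 20 of the extended value remains; since 20 >= 16 the extension bits are shifted out and a cheaper zero extension can be used.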

define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_ext_shl1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 31, i32 31, i32 30, i32 30, i32 29, i32 29, i32 28, i32 28>
  ret <8 x i32> %3
}
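; Every lane shifts by a total of at least 1 + 31 = 32 bits, so the result is known zero.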

define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [131072,524288,2097152,8388608]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [33554432,134217728,536870912,2147483648]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <8 x i32> %3
}

; fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i32> %3
}
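; lshr by 4 followed by shl by 4 simply clears the low 4 bits, so this lowers to a mask (pand) plus a zero extension.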

define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i32> %3
}

; fold (shl (sr[la] exact X,  C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
define <4 x i32> @combine_vec_shl_ge_ashr_extact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ge_ashr_extact0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_extact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}
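; C1 = 3 <= C2 = 5, so the exact ashr and the shl fold to a single shl by 5 - 3 = 2.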

define <4 x i32> @combine_vec_shl_ge_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_ge_ashr_extact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $3, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    psrad $4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ge_ashr_extact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $5, %xmm1
; SSE41-NEXT:    psrad $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_extact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sr[la] exact SEL(X,Y),  C1), C2) -> (shl SEL(X,Y), (C2-C1)) if C1 <= C2
define i32 @combine_shl_ge_sel_ashr_extact0(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: combine_shl_ge_sel_ashr_extact0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    testl %edx, %edx
; CHECK-NEXT:    cmovel %esi, %edi
; CHECK-NEXT:    leal (,%rdi,4), %eax
; CHECK-NEXT:    retq
  %cmp = icmp ne i32 %z, 0
  %ashrx = ashr exact i32 %x, 3
  %ashry = ashr exact i32 %y, 3
  %sel = select i1 %cmp, i32 %ashrx, i32 %ashry
  %shl = shl i32 %sel, 5
  ret i32 %shl
}
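; The fold looks through the select: both arms are exact ashr by 3, so the shl by 5 becomes a shl by 2, i.e. the scale-by-4 lea above.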

; fold (shl (sr[la] exact X,  C1), C2) -> (sr[la] X, (C1-C2)) if C1  > C2
define <4 x i32> @combine_vec_shl_lt_ashr_extact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_extact0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_extact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}
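; C1 = 5 > C2 = 3, so this folds to a single arithmetic shift right by 5 - 3 = 2.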

define <4 x i32> @combine_vec_shl_lt_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_lt_ashr_extact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $5, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $7, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    psrad $6, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_lt_ashr_extact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $7, %xmm1
; SSE41-NEXT:    psrad $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_extact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) if C2 > C1
define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}
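; C2 = 5 > C1 = 3: shift left by 5 - 3 = 2 and mask with -1 << 5 = 0xFFFFFFE0 (4294967264).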

define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_gt_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $3, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    psrld $4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_gt_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $5, %xmm1
; SSE41-NEXT:    psrld $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (srl x, (sub c1, c2)), MASK) if C1 >= C2
define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}
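; C1 = 5 >= C2 = 3: shift right by 5 - 3 = 2 and mask with (-1 >> 5) << 3 = 0x3FFFFFF8 (1073741816).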

define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_le_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $5, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $7, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    psrld $6, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_le_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $7, %xmm1
; SSE41-NEXT:    psrld $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr0:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}
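; ashr and shl by the same amount only clear the low 5 bits, so all that remains is an AND with -1 << 5 = 0xFFFFFFE0 (4294967264).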

define <4 x i32> @combine_vec_shl_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}
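; (x + 5) << 2 == (x << 2) + (5 << 2), hence the add of the splat constant 20.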

define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_add1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_add1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_or0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or  <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_or1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_or1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_or1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or  <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [20,20,20,20]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_mul0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}
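; (x * 5) << 2 == x * (5 << 2) == x * 20, so the shift is folded into the multiply.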

define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [10,24,56,128]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_mul1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (add (shl x, c1), c2) -> (or (shl x, c1), c2)
define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0)  {
; SSE2-LABEL: combine_vec_add_shl_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_nonsplat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 4, i32 5>
  %2 = add <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}
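; After shifting left by at least 2, the low two bits are known zero, so adding 3 cannot carry and the add becomes an OR.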

define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0)  {
; SSE2-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_and_nonsplat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  %3 = add <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0)  {
; SSE2-LABEL: combine_vec_add_shuffle_shl:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    pslld $2, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,3,0]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shuffle_shl:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $3, %xmm1
; SSE41-NEXT:    pslld $2, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shuffle_shl:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 0, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  %3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %3
}