1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
5
; Verify that logical and arithmetic packed shifts right by a constant
; build_vector are lowered into a sequence of two shifts plus a blend.
; On pre-AVX2 targets, instead of scalarizing such shifts, the backend
; should always try to emit this simpler two-shifts-plus-blend sequence
; when possible.
10
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
; v8i16 lshr: lanes 0-1 shift by 3, lanes 2-7 by 2. The differing lanes
; form one aligned i32 element, so the two shift results can be merged
; with a dword-granular blend (movss on SSE, vpblendd on AVX2) instead
; of scalarizing.
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
36
define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
; v8i16 lshr: low four lanes shift by 3, high four by 2. The split falls
; on a 64-bit boundary, so a qword-granular blend (movsd on SSE,
; vpblendd on AVX2) merges the two shift results.
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
62
define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
; v4i32 lshr with non-uniform amounts <3,2,2,2>. Pre-AVX2 targets use
; two immediate shifts plus a blend; AVX2 has per-element variable
; shifts, so a single vpsrlvd with a constant-pool operand suffices.
  %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}
86
define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
; v4i32 lshr split at the 64-bit boundary: lanes 0-1 shift by 3, lanes
; 2-3 by 2. SSE merges with movsd, AVX1 with vpblendw; AVX2 again uses
; a single variable-shift vpsrlvd.
  %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}
110
define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
; Arithmetic-shift counterpart of test1: v8i16 ashr with lanes 0-1 by 3
; and lanes 2-7 by 2, merged with a dword-granular blend.
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
136
define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
; Arithmetic-shift counterpart of test2: v8i16 ashr split at the 64-bit
; boundary (low four lanes by 3, high four by 2), merged with a
; qword-granular blend.
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
162
define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
; Arithmetic-shift counterpart of test3: v4i32 ashr <3,2,2,2>. Pre-AVX2
; uses two shifts plus a blend; AVX2 uses a single variable-shift
; vpsravd with a constant-pool operand.
  %lshr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}
186
define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
; Arithmetic-shift counterpart of test4: v4i32 ashr split at the 64-bit
; boundary (<3,3,2,2>), merged with movsd/vpblendw pre-AVX2; single
; vpsravd on AVX2.
  %lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}
210
define <8 x i16> @test9(<8 x i16> %a) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
; AVX-NEXT:    retq
; v8i16 ashr with an irregular amount pattern <1,3,1,1,1,3,3,3>. The mix
; does not line up with any movss/movsd element boundary, so SSE falls
; back to a pand/pandn/por select between the two shift results, while
; AVX targets can still use a single word-granular vpblendw.
  %lshr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <8 x i16> %lshr
}
232
define <8 x i32> @test10(<8 x i32>* %a) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    psrad $1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test10:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
; AVX2-NEXT:    retq
; v8i32 ashr where only lane 0 has a defined amount (1) and all other
; lanes are undef. The checks show it lowered as a uniform shift by 1;
; on SSE/AVX1 only the low 128 bits are computed, which appears to rely
; on the undef lanes making the upper half's value unconstrained.
  %ld = load <8 x i32>, <8 x i32>* %a, align 32
  %ashr = ashr <8 x i32> %ld, <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %ashr
}
255
256; test11 vs test12 - show difference between v16i16 that is repeated/non-repeated at v8i16 level (for PBLENDW masks).
257
define <16 x i16> @test11(<16 x i16> %a) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
; v16i16 shl where the amount pattern does NOT repeat between the two
; 128-bit halves, so a single vpblendw (whose immediate is reused per
; 128-bit lane) cannot express the merge on AVX2: it falls back to a
; multiply by powers of two (vpmullw), as does SSE with two distinct
; constant-pool vectors. AVX1 handles each half separately with
; shifts + blend. Compare with test12.
  %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 1, i16 1, i16 1, i16 3, i16 1>
  ret <16 x i16> %lshr
}
284
define <16 x i16> @test12(<16 x i16> %a) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $3, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT:    retq
; v16i16 shl where the amount pattern DOES repeat between the two
; 128-bit halves, so AVX2 can use two 256-bit shifts plus one ymm
; vpblendw (the 8-bit blend mask applies identically to each lane).
; SSE multiplies both halves by the same power-of-two constant; AVX1
; processes each half with shifts + blend. Compare with test11.
  %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <16 x i16> %lshr
}
314