1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
5
; Verify that the shifts below are lowered into a sequence of two shifts plus a
; blend. On pre-AVX2 targets, rather than scalarizing a logical or arithmetic
; packed shift right by a constant build_vector, the backend should always try
; to emit the simpler two-shifts-plus-blend sequence when possible.
10
; Logical v8i16 shift right with two distinct amounts: lanes 0-1 by 3,
; lanes 2-7 by 2.  Expected lowering is two uniform psrlw shifts blended
; together (movss merges the low 32 bits, i.e. the two low i16 lanes)
; instead of scalarizing the shift.
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
36
; Logical v8i16 shift right where the two amounts split at the 64-bit
; boundary: lanes 0-3 by 3, lanes 4-7 by 2.  Expected lowering is two
; uniform psrlw shifts plus a 64-bit blend (movsd / pblend).
define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $3, %xmm1
; SSE-NEXT:    psrlw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}
62
; Logical v4i32 shift right: lane 0 by 3, lanes 1-3 by 2.  Pre-AVX2
; targets should emit two uniform psrld shifts plus a blend; targets
; with per-lane variable shifts use a single vpsrlvd with a constant
; amount vector instead.
define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}
86
; Logical v4i32 shift right split at the 64-bit boundary: lanes 0-1 by 3,
; lanes 2-3 by 2.  Pre-AVX2 targets should emit two uniform psrld shifts
; plus a 64-bit blend; targets with per-lane variable shifts use a single
; vpsrlvd with a constant amount vector.
define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $3, %xmm1
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}
110
; Arithmetic v8i16 shift right with two distinct amounts: lanes 0-1 by 3,
; lanes 2-7 by 2.  Expected lowering is two uniform psraw shifts blended
; together instead of scalarizing the shift.  (The result was previously
; misnamed %lshr; renamed to %ashr to match the opcode, as in test10 —
; SSA value names do not affect the generated machine code.)
define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %ashr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %ashr
}
136
; Arithmetic v8i16 shift right split at the 64-bit boundary: lanes 0-3
; by 3, lanes 4-7 by 2.  Expected lowering is two uniform psraw shifts
; plus a 64-bit blend.  (Result renamed %lshr -> %ashr to match the
; opcode; value names do not affect the generated machine code.)
define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    psraw $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %ashr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %ashr
}
162
; Arithmetic v4i32 shift right: lane 0 by 3, lanes 1-3 by 2.  Pre-AVX2
; targets should emit two uniform psrad shifts plus a blend; targets with
; per-lane variable shifts use a single vpsravd with a constant amount
; vector.  (Result renamed %lshr -> %ashr to match the opcode; value
; names do not affect the generated machine code.)
define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %ashr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %ashr
}
186
; Arithmetic v4i32 shift right split at the 64-bit boundary: lanes 0-1
; by 3, lanes 2-3 by 2.  Pre-AVX2 targets should emit two uniform psrad
; shifts plus a 64-bit blend; targets with per-lane variable shifts use
; a single vpsravd.  (Result renamed %lshr -> %ashr to match the opcode;
; value names do not affect the generated machine code.)
define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $3, %xmm1
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %ashr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %ashr
}
210
; Arithmetic v8i16 shift right with an irregular amount pattern
; <1,3,1,1,1,3,3,3> that cannot be expressed as a single movss/movsd
; merge, so the SSE lowering falls back to two shifts combined with a
; pand/pandn/por select; AVX targets can still use a single vpblendw.
; (Result renamed %lshr -> %ashr to match the opcode; value names do
; not affect the generated machine code.)
define <8 x i16> @test9(<8 x i16> %a) {
; SSE-LABEL: test9:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
; AVX-NEXT:    retq
  %ashr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <8 x i16> %ashr
}
232
; Arithmetic v8i32 shift right of a loaded vector where only lane 0's
; amount (1) is defined and every other amount is undef.  The backend is
; free to shift every lane by 1 and to leave the undef lanes with
; arbitrary contents — e.g. the AVX1 output only defines the low 128
; bits of the ymm0 result, and the SSE output's xmm1 handling appears to
; rely on the undef lanes being unconstrained (NOTE(review): checks are
; autogenerated; regenerate with update_llc_test_checks.py rather than
; editing by hand).
define <8 x i32> @test10(<8 x i32>* %a) {
; SSE-LABEL: test10:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm0
; SSE-NEXT:    movdqa 16(%rdi), %xmm1
; SSE-NEXT:    psrad %xmm0, %xmm1
; SSE-NEXT:    psrad $1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test10:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test10:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
; AVX2-NEXT:    vpsrad $1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %ld = load <8 x i32>, <8 x i32>* %a, align 32
  %ashr = ashr <8 x i32> %ld, <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %ashr
}
257
; test11 vs. test12: show the difference between a v16i16 shift whose amounts
; repeat at the v8i16 level and one whose amounts do not (the repeated pattern
; allows a single per-128-bit-lane PBLENDW mask on AVX2).
259
; v16i16 shift left whose amounts (1 or 3) do NOT repeat at the v8i16
; level: the two 128-bit halves use different patterns.  SSE therefore
; needs two different pmullw constants (shl-by-constant becomes a
; multiply by 2 or 8), and AVX2 uses a single 256-bit vpmullw; AVX1
; splits the vector and uses shift+blend per half.  (Result renamed
; %lshr -> %shl to match the opcode; value names do not affect the
; generated machine code.)
define <16 x i16> @test11(<16 x i16> %a) {
; SSE-LABEL: test11:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test11:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test11:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 1, i16 1, i16 1, i16 3, i16 1>
  ret <16 x i16> %shl
}
286
; v16i16 shift left whose amounts DO repeat at the v8i16 level: both
; 128-bit halves use the same <1,3,1,1,1,3,3,3> pattern.  SSE can share
; one pmullw constant (multiply by <2,8,...>) across both halves, and
; AVX2 can use a 256-bit shift+vpblendw because the PBLENDW mask repeats
; per 128-bit lane.  Compare with test11, where the halves differ.
; (Result renamed %lshr -> %shl to match the opcode; value names do not
; affect the generated machine code.)
define <16 x i16> @test12(<16 x i16> %a) {
; SSE-LABEL: test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: test12:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test12:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $3, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <16 x i16> %shl
}
316