; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2

;
; Variable Shifts
;

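; Informational summary (not FileCheck-verified): x86 has no variable-count vector
; ashr for i64 until AVX512 (vpsravq/vpsraq), so the pre-AVX512 lowerings below
; emulate it from logical shifts, roughly:
;   ashr(x, n) == (lshr(x, n) ^ m) - m,  where m = lshr(0x8000000000000000, n)
; AVX2 applies vpsrlvq to both %a and the sign-bit constant; AVX1 also lacks
; vpsrlvq, so it shifts with vpsrlq per 64-bit count and blends the two halves.
; XOP negates the amounts and uses its native vpshaq (positive = left, negative = right).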
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm5, %xmm3, %xmm6
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT:    vpsrlq %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpsrlq %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq %xmm5, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshaq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsravq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; X86-AVX1-NEXT:    # xmm3 = mem[0,0]
; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; X86-AVX1-NEXT:    vpsrlq %xmm5, %xmm3, %xmm6
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm6, %xmm2
; X86-AVX1-NEXT:    vpsrlq %xmm5, %xmm6, %xmm5
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; X86-AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm4
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; X86-AVX1-NEXT:    vpsrlq %xmm5, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpsrlq %xmm5, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; X86-AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = ashr <4 x i64> %a, %b
  ret <4 x i64> %shift
}

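; v8i32: AVX2 and later have vpsravd directly. AVX1 has no variable 32-bit shifts,
; so each lane count is isolated from %b (shifts/unpacks/zero-extension) and fed to
; vpsrad as a scalar count, with the partial results blended back together. XOP again
; uses vpshad on negated amounts.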
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
129; AVX1-LABEL: var_shift_v8i32:
130; AVX1:       # %bb.0:
131; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
132; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
133; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
134; AVX1-NEXT:    vpsrad %xmm4, %xmm2, %xmm4
135; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
136; AVX1-NEXT:    vpsrad %xmm5, %xmm2, %xmm5
137; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
138; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
139; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
140; AVX1-NEXT:    vpsrad %xmm6, %xmm2, %xmm6
141; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
142; AVX1-NEXT:    vpsrad %xmm3, %xmm2, %xmm2
143; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
144; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
145; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
146; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
147; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
148; AVX1-NEXT:    vpsrad %xmm4, %xmm0, %xmm4
149; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
150; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
151; AVX1-NEXT:    vpsrad %xmm4, %xmm0, %xmm4
152; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
153; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
154; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
155; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
156; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
157; AVX1-NEXT:    retq
158;
159; AVX2-LABEL: var_shift_v8i32:
160; AVX2:       # %bb.0:
161; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
162; AVX2-NEXT:    retq
163;
164; XOPAVX1-LABEL: var_shift_v8i32:
165; XOPAVX1:       # %bb.0:
166; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
167; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
168; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
169; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
170; XOPAVX1-NEXT:    vpshad %xmm2, %xmm4, %xmm2
171; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
172; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
173; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
174; XOPAVX1-NEXT:    retq
175;
176; XOPAVX2-LABEL: var_shift_v8i32:
177; XOPAVX2:       # %bb.0:
178; XOPAVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
179; XOPAVX2-NEXT:    retq
180;
181; AVX512-LABEL: var_shift_v8i32:
182; AVX512:       # %bb.0:
183; AVX512-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
184; AVX512-NEXT:    retq
185;
186; AVX512VL-LABEL: var_shift_v8i32:
187; AVX512VL:       # %bb.0:
188; AVX512VL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
189; AVX512VL-NEXT:    retq
190;
191; X86-AVX1-LABEL: var_shift_v8i32:
192; X86-AVX1:       # %bb.0:
193; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
194; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
195; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
196; X86-AVX1-NEXT:    vpsrad %xmm4, %xmm2, %xmm4
197; X86-AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
198; X86-AVX1-NEXT:    vpsrad %xmm5, %xmm2, %xmm5
199; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
200; X86-AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
201; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
202; X86-AVX1-NEXT:    vpsrad %xmm6, %xmm2, %xmm6
203; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
204; X86-AVX1-NEXT:    vpsrad %xmm3, %xmm2, %xmm2
205; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
206; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
207; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
208; X86-AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
209; X86-AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
210; X86-AVX1-NEXT:    vpsrad %xmm4, %xmm0, %xmm4
211; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
212; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
213; X86-AVX1-NEXT:    vpsrad %xmm4, %xmm0, %xmm4
214; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
215; X86-AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
216; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
217; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
218; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
219; X86-AVX1-NEXT:    retl
220;
221; X86-AVX2-LABEL: var_shift_v8i32:
222; X86-AVX2:       # %bb.0:
223; X86-AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
224; X86-AVX2-NEXT:    retl
  %shift = ashr <8 x i32> %a, %b
  ret <8 x i32> %shift
}

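; v16i16: there is no variable per-element 16-bit shift before AVX512BW (vpsravw).
; AVX1 moves the amounts into the top bits of each word (vpsllw $12/$4 + vpor) and
; runs a blend ladder of vpsraw by 8/4/2/1, selecting with vpblendvb and doubling
; the mask each step. AVX2 and AVX512DQ widen to 32-bit lanes, use vpsravd, then
; repack (vpackusdw / vpmovdw).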
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
230; AVX1-LABEL: var_shift_v16i16:
231; AVX1:       # %bb.0:
232; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
233; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
234; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
235; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
236; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
237; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
238; AVX1-NEXT:    vpsraw $8, %xmm4, %xmm5
239; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
240; AVX1-NEXT:    vpsraw $4, %xmm2, %xmm4
241; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
242; AVX1-NEXT:    vpsraw $2, %xmm2, %xmm4
243; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
244; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
245; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm4
246; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
247; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
248; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
249; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
250; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
251; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
252; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm4
253; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
254; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
255; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
256; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
257; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
258; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
259; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
260; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
261; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
262; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
263; AVX1-NEXT:    retq
264;
265; AVX2-LABEL: var_shift_v16i16:
266; AVX2:       # %bb.0:
267; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
268; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
269; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
270; AVX2-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
271; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
272; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
273; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
274; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
275; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
276; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
277; AVX2-NEXT:    retq
278;
279; XOPAVX1-LABEL: var_shift_v16i16:
280; XOPAVX1:       # %bb.0:
281; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
282; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
283; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
284; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
285; XOPAVX1-NEXT:    vpshaw %xmm2, %xmm4, %xmm2
286; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
287; XOPAVX1-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
288; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
289; XOPAVX1-NEXT:    retq
290;
291; XOPAVX2-LABEL: var_shift_v16i16:
292; XOPAVX2:       # %bb.0:
293; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
294; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
295; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
296; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
297; XOPAVX2-NEXT:    vpshaw %xmm2, %xmm4, %xmm2
298; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
299; XOPAVX2-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
300; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
301; XOPAVX2-NEXT:    retq
302;
303; AVX512DQ-LABEL: var_shift_v16i16:
304; AVX512DQ:       # %bb.0:
305; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
306; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
307; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
308; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
309; AVX512DQ-NEXT:    retq
310;
311; AVX512BW-LABEL: var_shift_v16i16:
312; AVX512BW:       # %bb.0:
313; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
314; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
315; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
316; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
317; AVX512BW-NEXT:    retq
318;
319; AVX512DQVL-LABEL: var_shift_v16i16:
320; AVX512DQVL:       # %bb.0:
321; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
322; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
323; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
324; AVX512DQVL-NEXT:    vpmovdw %zmm0, %ymm0
325; AVX512DQVL-NEXT:    retq
326;
327; AVX512BWVL-LABEL: var_shift_v16i16:
328; AVX512BWVL:       # %bb.0:
329; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
330; AVX512BWVL-NEXT:    retq
331;
332; X86-AVX1-LABEL: var_shift_v16i16:
333; X86-AVX1:       # %bb.0:
334; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
335; X86-AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
336; X86-AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
337; X86-AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
338; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
339; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
340; X86-AVX1-NEXT:    vpsraw $8, %xmm4, %xmm5
341; X86-AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
342; X86-AVX1-NEXT:    vpsraw $4, %xmm2, %xmm4
343; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
344; X86-AVX1-NEXT:    vpsraw $2, %xmm2, %xmm4
345; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
346; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
347; X86-AVX1-NEXT:    vpsraw $1, %xmm2, %xmm4
348; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
349; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
350; X86-AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
351; X86-AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
352; X86-AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
353; X86-AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
354; X86-AVX1-NEXT:    vpsraw $8, %xmm0, %xmm4
355; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
356; X86-AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
357; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
358; X86-AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
359; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
360; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
361; X86-AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
362; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
363; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
364; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
365; X86-AVX1-NEXT:    retl
366;
367; X86-AVX2-LABEL: var_shift_v16i16:
368; X86-AVX2:       # %bb.0:
369; X86-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
370; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
371; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
372; X86-AVX2-NEXT:    vpsravd %ymm3, %ymm4, %ymm3
373; X86-AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
374; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
375; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
376; X86-AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
377; X86-AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
378; X86-AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
379; X86-AVX2-NEXT:    retl
  %shift = ashr <16 x i16> %a, %b
  ret <16 x i16> %shift
}

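; v32i8: x86 has no vector byte shifts at all, so the vector is unpacked to 16-bit
; lanes and a vpsraw blend ladder (amounts pre-shifted into the byte MSB with
; vpsllw $5) is applied; vpsrlw $8 + vpackuswb put the bytes back. AVX512BW
; sign-extends to words in a zmm, uses vpsravw and truncates with vpmovwb; XOP has
; a native per-byte shift (vpshab) taking negated amounts.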
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
385; AVX1-LABEL: var_shift_v32i8:
386; AVX1:       # %bb.0:
387; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
388; AVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
389; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
390; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
391; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
392; AVX1-NEXT:    vpsraw $4, %xmm5, %xmm6
393; AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
394; AVX1-NEXT:    vpsraw $2, %xmm5, %xmm6
395; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
396; AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
397; AVX1-NEXT:    vpsraw $1, %xmm5, %xmm6
398; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
399; AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
400; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
401; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
402; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
403; AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
404; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
405; AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
406; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
407; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
408; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
409; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
410; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
411; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
412; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
413; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
414; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
415; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
416; AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
417; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
418; AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
419; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
420; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
421; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
422; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
423; AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
424; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
425; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
426; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
427; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm4
428; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
429; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm4
430; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
431; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
432; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm4
433; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
434; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
435; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
436; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
437; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
438; AVX1-NEXT:    retq
439;
440; AVX2-LABEL: var_shift_v32i8:
441; AVX2:       # %bb.0:
442; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
443; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
444; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
445; AVX2-NEXT:    vpsraw $4, %ymm3, %ymm4
446; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
447; AVX2-NEXT:    vpsraw $2, %ymm3, %ymm4
448; AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
449; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
450; AVX2-NEXT:    vpsraw $1, %ymm3, %ymm4
451; AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
452; AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
453; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
454; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
455; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
456; AVX2-NEXT:    vpsraw $4, %ymm0, %ymm3
457; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
458; AVX2-NEXT:    vpsraw $2, %ymm0, %ymm3
459; AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
460; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
461; AVX2-NEXT:    vpsraw $1, %ymm0, %ymm3
462; AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
463; AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
464; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
465; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
466; AVX2-NEXT:    retq
467;
468; XOPAVX1-LABEL: var_shift_v32i8:
469; XOPAVX1:       # %bb.0:
470; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
471; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
472; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
473; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
474; XOPAVX1-NEXT:    vpshab %xmm2, %xmm4, %xmm2
475; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
476; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
477; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
478; XOPAVX1-NEXT:    retq
479;
480; XOPAVX2-LABEL: var_shift_v32i8:
481; XOPAVX2:       # %bb.0:
482; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
483; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
484; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
485; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
486; XOPAVX2-NEXT:    vpshab %xmm2, %xmm4, %xmm2
487; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
488; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
489; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
490; XOPAVX2-NEXT:    retq
491;
492; AVX512DQ-LABEL: var_shift_v32i8:
493; AVX512DQ:       # %bb.0:
494; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
495; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
496; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
497; AVX512DQ-NEXT:    vpsraw $4, %ymm3, %ymm4
498; AVX512DQ-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
499; AVX512DQ-NEXT:    vpsraw $2, %ymm3, %ymm4
500; AVX512DQ-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
501; AVX512DQ-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
502; AVX512DQ-NEXT:    vpsraw $1, %ymm3, %ymm4
503; AVX512DQ-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
504; AVX512DQ-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
505; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
506; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
507; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
508; AVX512DQ-NEXT:    vpsraw $4, %ymm0, %ymm3
509; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
510; AVX512DQ-NEXT:    vpsraw $2, %ymm0, %ymm3
511; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
512; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
513; AVX512DQ-NEXT:    vpsraw $1, %ymm0, %ymm3
514; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
515; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
516; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
517; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
518; AVX512DQ-NEXT:    retq
519;
520; AVX512BW-LABEL: var_shift_v32i8:
521; AVX512BW:       # %bb.0:
522; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
523; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
524; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
525; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
526; AVX512BW-NEXT:    retq
527;
528; AVX512DQVL-LABEL: var_shift_v32i8:
529; AVX512DQVL:       # %bb.0:
530; AVX512DQVL-NEXT:    vpsllw $5, %ymm1, %ymm1
531; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
532; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
533; AVX512DQVL-NEXT:    vpsraw $4, %ymm3, %ymm4
534; AVX512DQVL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
535; AVX512DQVL-NEXT:    vpsraw $2, %ymm3, %ymm4
536; AVX512DQVL-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
537; AVX512DQVL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
538; AVX512DQVL-NEXT:    vpsraw $1, %ymm3, %ymm4
539; AVX512DQVL-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
540; AVX512DQVL-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
541; AVX512DQVL-NEXT:    vpsrlw $8, %ymm2, %ymm2
542; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
543; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
544; AVX512DQVL-NEXT:    vpsraw $4, %ymm0, %ymm3
545; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
546; AVX512DQVL-NEXT:    vpsraw $2, %ymm0, %ymm3
547; AVX512DQVL-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
548; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
549; AVX512DQVL-NEXT:    vpsraw $1, %ymm0, %ymm3
550; AVX512DQVL-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
551; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
552; AVX512DQVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
553; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
554; AVX512DQVL-NEXT:    retq
555;
556; AVX512BWVL-LABEL: var_shift_v32i8:
557; AVX512BWVL:       # %bb.0:
558; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
559; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
560; AVX512BWVL-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
561; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
562; AVX512BWVL-NEXT:    retq
563;
564; X86-AVX1-LABEL: var_shift_v32i8:
565; X86-AVX1:       # %bb.0:
566; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
567; X86-AVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
568; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
569; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
570; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
571; X86-AVX1-NEXT:    vpsraw $4, %xmm5, %xmm6
572; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
573; X86-AVX1-NEXT:    vpsraw $2, %xmm5, %xmm6
574; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
575; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm5, %xmm5
576; X86-AVX1-NEXT:    vpsraw $1, %xmm5, %xmm6
577; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
578; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm6, %xmm5, %xmm3
579; X86-AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
580; X86-AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
581; X86-AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
582; X86-AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
583; X86-AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
584; X86-AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
585; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
586; X86-AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm4
587; X86-AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
588; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
589; X86-AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
590; X86-AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
591; X86-AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
592; X86-AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
593; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
594; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
595; X86-AVX1-NEXT:    vpsraw $4, %xmm4, %xmm5
596; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
597; X86-AVX1-NEXT:    vpsraw $2, %xmm4, %xmm5
598; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
599; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm4
600; X86-AVX1-NEXT:    vpsraw $1, %xmm4, %xmm5
601; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
602; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm5, %xmm4, %xmm3
603; X86-AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
604; X86-AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
605; X86-AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
606; X86-AVX1-NEXT:    vpsraw $4, %xmm0, %xmm4
607; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
608; X86-AVX1-NEXT:    vpsraw $2, %xmm0, %xmm4
609; X86-AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
610; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
611; X86-AVX1-NEXT:    vpsraw $1, %xmm0, %xmm4
612; X86-AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
613; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
614; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
615; X86-AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
616; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
617; X86-AVX1-NEXT:    retl
618;
619; X86-AVX2-LABEL: var_shift_v32i8:
620; X86-AVX2:       # %bb.0:
621; X86-AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
622; X86-AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
623; X86-AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
624; X86-AVX2-NEXT:    vpsraw $4, %ymm3, %ymm4
625; X86-AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
626; X86-AVX2-NEXT:    vpsraw $2, %ymm3, %ymm4
627; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
628; X86-AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
629; X86-AVX2-NEXT:    vpsraw $1, %ymm3, %ymm4
630; X86-AVX2-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
631; X86-AVX2-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
632; X86-AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
633; X86-AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
634; X86-AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
635; X86-AVX2-NEXT:    vpsraw $4, %ymm0, %ymm3
636; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
637; X86-AVX2-NEXT:    vpsraw $2, %ymm0, %ymm3
638; X86-AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
639; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
640; X86-AVX2-NEXT:    vpsraw $1, %ymm0, %ymm3
641; X86-AVX2-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
642; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
643; X86-AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
644; X86-AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
645; X86-AVX2-NEXT:    retl
  %shift = ashr <32 x i8> %a, %b
  ret <32 x i8> %shift
}

;
; Uniform Variable Shifts
;

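; With a splatted shift amount the legacy shift forms that take their count from the
; low 64 bits of an xmm register (vpsraw/vpsrad/vpsrlq %xmm, ...) can be used, so only
; element 0 of %b matters and no broadcast is needed. i64 still needs the xor/sub sign
; fixup before AVX512 because there is no vpsraq.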
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
655; AVX1-LABEL: splatvar_shift_v4i64:
656; AVX1:       # %bb.0:
657; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
658; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
659; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
660; AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm3
661; AVX1-NEXT:    vpxor %xmm2, %xmm3, %xmm3
662; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm3
663; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
664; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
665; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
666; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
667; AVX1-NEXT:    retq
668;
669; AVX2-LABEL: splatvar_shift_v4i64:
670; AVX2:       # %bb.0:
671; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
672; AVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
673; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
674; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
675; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
676; AVX2-NEXT:    retq
677;
678; XOPAVX1-LABEL: splatvar_shift_v4i64:
679; XOPAVX1:       # %bb.0:
680; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
681; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
682; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
683; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
684; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm2, %xmm2
685; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
686; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
687; XOPAVX1-NEXT:    retq
688;
689; XOPAVX2-LABEL: splatvar_shift_v4i64:
690; XOPAVX2:       # %bb.0:
691; XOPAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
692; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
693; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
694; XOPAVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
695; XOPAVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
696; XOPAVX2-NEXT:    retq
697;
698; AVX512-LABEL: splatvar_shift_v4i64:
699; AVX512:       # %bb.0:
700; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
701; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
702; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
703; AVX512-NEXT:    retq
704;
705; AVX512VL-LABEL: splatvar_shift_v4i64:
706; AVX512VL:       # %bb.0:
707; AVX512VL-NEXT:    vpsraq %xmm1, %ymm0, %ymm0
708; AVX512VL-NEXT:    retq
709;
710; X86-AVX1-LABEL: splatvar_shift_v4i64:
711; X86-AVX1:       # %bb.0:
712; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
713; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
714; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
715; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm3, %xmm3
716; X86-AVX1-NEXT:    vpxor %xmm2, %xmm3, %xmm3
717; X86-AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm3
718; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
719; X86-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
720; X86-AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
721; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
722; X86-AVX1-NEXT:    retl
723;
724; X86-AVX2-LABEL: splatvar_shift_v4i64:
725; X86-AVX2:       # %bb.0:
726; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
727; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm2, %ymm2
728; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
729; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
730; X86-AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
731; X86-AVX2-NEXT:    retl
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %shift = ashr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}

define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrad %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpsrad %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX2-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpsrad %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX2-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %shift = ashr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}

define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsraw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpsraw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX2-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpsraw %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X86-AVX2-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %shift = ashr <16 x i16> %a, %splat
  ret <16 x i16> %shift
}

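; Splatted byte shifts: the whole vector is shifted as words by the uniform count
; (vpsrlw), the bits that leak in from the neighbouring byte are masked off with a
; mask derived from shifting an all-ones register by the same count, and the sign is
; restored with the usual xor/sub trick on 0x80 bytes (the 0x8080 word constant).
; AVX512BW instead sign-extends to words in a zmm, uses vpsraw and truncates back.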
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
862; AVX1-LABEL: splatvar_shift_v32i8:
863; AVX1:       # %bb.0:
864; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
865; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
866; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
867; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
868; AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
869; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
870; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
871; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
872; AVX1-NEXT:    vpsrlw %xmm1, %xmm4, %xmm4
873; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
874; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
875; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
876; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
877; AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
878; AVX1-NEXT:    vpsubb %xmm4, %xmm0, %xmm0
879; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
880; AVX1-NEXT:    retq
881;
882; AVX2-LABEL: splatvar_shift_v32i8:
883; AVX2:       # %bb.0:
884; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
885; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
886; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
887; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
888; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
889; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
890; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
891; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
892; AVX2-NEXT:    vpsrlw %xmm1, %ymm2, %ymm1
893; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
894; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
895; AVX2-NEXT:    retq
896;
897; XOPAVX1-LABEL: splatvar_shift_v32i8:
898; XOPAVX1:       # %bb.0:
899; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
900; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
901; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
902; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
903; XOPAVX1-NEXT:    vpshab %xmm1, %xmm2, %xmm2
904; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
905; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
906; XOPAVX1-NEXT:    retq
907;
908; XOPAVX2-LABEL: splatvar_shift_v32i8:
909; XOPAVX2:       # %bb.0:
910; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
911; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
912; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
913; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
914; XOPAVX2-NEXT:    vpshab %xmm1, %xmm2, %xmm2
915; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
916; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
917; XOPAVX2-NEXT:    retq
918;
919; AVX512DQ-LABEL: splatvar_shift_v32i8:
920; AVX512DQ:       # %bb.0:
921; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
922; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
923; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
924; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
925; AVX512DQ-NEXT:    vpsrlw $8, %xmm2, %xmm2
926; AVX512DQ-NEXT:    vpbroadcastb %xmm2, %ymm2
927; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
928; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
929; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm1
930; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
931; AVX512DQ-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
932; AVX512DQ-NEXT:    retq
933;
934; AVX512BW-LABEL: splatvar_shift_v32i8:
935; AVX512BW:       # %bb.0:
936; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
937; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
938; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
939; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
940; AVX512BW-NEXT:    retq
941;
942; AVX512DQVL-LABEL: splatvar_shift_v32i8:
943; AVX512DQVL:       # %bb.0:
944; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
945; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
946; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
947; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
948; AVX512DQVL-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
949; AVX512DQVL-NEXT:    vpsrlw %xmm1, %xmm3, %xmm1
950; AVX512DQVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
951; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %ymm1
952; AVX512DQVL-NEXT:    vpternlogq $108, %ymm0, %ymm2, %ymm1
953; AVX512DQVL-NEXT:    vpsubb %ymm2, %ymm1, %ymm0
954; AVX512DQVL-NEXT:    retq
955;
956; AVX512BWVL-LABEL: splatvar_shift_v32i8:
957; AVX512BWVL:       # %bb.0:
958; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
959; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
960; AVX512BWVL-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
961; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
962; AVX512BWVL-NEXT:    retq
963;
964; X86-AVX1-LABEL: splatvar_shift_v32i8:
965; X86-AVX1:       # %bb.0:
966; X86-AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
967; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
968; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
969; X86-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
970; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
971; X86-AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
972; X86-AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
973; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [32896,32896,32896,32896,32896,32896,32896,32896]
974; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm4, %xmm4
975; X86-AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
976; X86-AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
977; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
978; X86-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
979; X86-AVX1-NEXT:    vpxor %xmm4, %xmm0, %xmm0
980; X86-AVX1-NEXT:    vpsubb %xmm4, %xmm0, %xmm0
981; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
982; X86-AVX1-NEXT:    retl
983;
984; X86-AVX2-LABEL: splatvar_shift_v32i8:
985; X86-AVX2:       # %bb.0:
986; X86-AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
987; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
988; X86-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
989; X86-AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
990; X86-AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
991; X86-AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
992; X86-AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
993; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
994; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm2, %ymm1
995; X86-AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
996; X86-AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
997; X86-AVX2-NEXT:    retl
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %shift = ashr <32 x i8> %a, %splat
  ret <32 x i8> %shift
}

;
; Constant Shifts
;

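; Constant i64 shifts: the sign-fixup masks m = 0x8000000000000000 >> n are known at
; compile time, so AVX2 emits vpsrlvq by <1,7,31,62> followed by xor/sub with the
; constant masks (2^62, 2^56, 2^32, 2). AVX1 works on xmm halves with immediate
; vpsrlq pairs plus vpblendw; AVX512 uses vpsravq directly.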
define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
1008; AVX1-LABEL: constant_shift_v4i64:
1009; AVX1:       # %bb.0:
1010; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1011; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
1012; AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
1013; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1014; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4294967296,2]
1015; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
1016; AVX1-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
1017; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
1018; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
1019; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1020; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [4611686018427387904,72057594037927936]
1021; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
1022; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
1023; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1024; AVX1-NEXT:    retq
1025;
1026; AVX2-LABEL: constant_shift_v4i64:
1027; AVX2:       # %bb.0:
1028; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1029; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
1030; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
1031; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
1032; AVX2-NEXT:    retq
1033;
1034; XOPAVX1-LABEL: constant_shift_v4i64:
1035; XOPAVX1:       # %bb.0:
1036; XOPAVX1-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1037; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1038; XOPAVX1-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1039; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1040; XOPAVX1-NEXT:    retq
1041;
1042; XOPAVX2-LABEL: constant_shift_v4i64:
1043; XOPAVX2:       # %bb.0:
1044; XOPAVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1045; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
1046; XOPAVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
1047; XOPAVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
1048; XOPAVX2-NEXT:    retq
1049;
1050; AVX512-LABEL: constant_shift_v4i64:
1051; AVX512:       # %bb.0:
1052; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1053; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,7,31,62]
1054; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
1055; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1056; AVX512-NEXT:    retq
1057;
1058; AVX512VL-LABEL: constant_shift_v4i64:
1059; AVX512VL:       # %bb.0:
1060; AVX512VL-NEXT:    vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1061; AVX512VL-NEXT:    retq
1062;
1063; X86-AVX1-LABEL: constant_shift_v4i64:
1064; X86-AVX1:       # %bb.0:
1065; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1066; X86-AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
1067; X86-AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
1068; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1069; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,0]
1070; X86-AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
1071; X86-AVX1-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
1072; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
1073; X86-AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
1074; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1075; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1073741824,0,16777216]
1076; X86-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
1077; X86-AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
1078; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1079; X86-AVX1-NEXT:    retl
1080;
1081; X86-AVX2-LABEL: constant_shift_v4i64:
1082; X86-AVX2:       # %bb.0:
1083; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
1084; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
1085; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
1086; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
1087; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
1088; X86-AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
1089; X86-AVX2-NEXT:    retl
  %shift = ashr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
  ret <4 x i64> %shift
}

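; Constant i32 shifts: AVX2 and later fold the amounts into a vpsravd memory operand.
; AVX1 has only the immediate form, so each distinct amount gets its own vpsrad and
; the results are merged with vpblendw.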
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
1095; AVX1-LABEL: constant_shift_v8i32:
1096; AVX1:       # %bb.0:
1097; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm1
1098; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm2
1099; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1100; AVX1-NEXT:    vpsrad $6, %xmm0, %xmm2
1101; AVX1-NEXT:    vpsrad $4, %xmm0, %xmm3
1102; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1103; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1104; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1105; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm2
1106; AVX1-NEXT:    vpsrad $9, %xmm0, %xmm3
1107; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1108; AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
1109; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1110; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1111; AVX1-NEXT:    retq
1112;
1113; AVX2-LABEL: constant_shift_v8i32:
1114; AVX2:       # %bb.0:
1115; AVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1116; AVX2-NEXT:    retq
1117;
1118; XOPAVX1-LABEL: constant_shift_v8i32:
1119; XOPAVX1:       # %bb.0:
1120; XOPAVX1-NEXT:    vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1121; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1122; XOPAVX1-NEXT:    vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1123; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1124; XOPAVX1-NEXT:    retq
1125;
1126; XOPAVX2-LABEL: constant_shift_v8i32:
1127; XOPAVX2:       # %bb.0:
1128; XOPAVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1129; XOPAVX2-NEXT:    retq
1130;
1131; AVX512-LABEL: constant_shift_v8i32:
1132; AVX512:       # %bb.0:
1133; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1134; AVX512-NEXT:    retq
1135;
1136; AVX512VL-LABEL: constant_shift_v8i32:
1137; AVX512VL:       # %bb.0:
1138; AVX512VL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1139; AVX512VL-NEXT:    retq
1140;
1141; X86-AVX1-LABEL: constant_shift_v8i32:
1142; X86-AVX1:       # %bb.0:
1143; X86-AVX1-NEXT:    vpsrad $7, %xmm0, %xmm1
1144; X86-AVX1-NEXT:    vpsrad $5, %xmm0, %xmm2
1145; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1146; X86-AVX1-NEXT:    vpsrad $6, %xmm0, %xmm2
1147; X86-AVX1-NEXT:    vpsrad $4, %xmm0, %xmm3
1148; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1149; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1150; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1151; X86-AVX1-NEXT:    vpsrad $7, %xmm0, %xmm2
1152; X86-AVX1-NEXT:    vpsrad $9, %xmm0, %xmm3
1153; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1154; X86-AVX1-NEXT:    vpsrad $8, %xmm0, %xmm0
1155; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1156; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1157; X86-AVX1-NEXT:    retl
1158;
1159; X86-AVX2-LABEL: constant_shift_v8i32:
1160; X86-AVX2:       # %bb.0:
1161; X86-AVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1162; X86-AVX2-NEXT:    retl
1163  %shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
1164  ret <8 x i32> %shift
1165}
1166
define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v16i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v16i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v16i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: constant_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; X86-AVX1-NEXT:    vpsraw $1, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: constant_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
; X86-AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; X86-AVX2-NEXT:    vpsraw $1, %xmm0, %xmm0
; X86-AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5,6,7]
; X86-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X86-AVX2-NEXT:    retl
  %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <16 x i16> %shift
}

define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
; AVX1-NEXT:    vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT:    vpsraw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX1-NEXT:    vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
; XOPAVX2-NEXT:    vpshab %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpshab %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v32i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpsraw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT:    vpsraw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQVL-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: constant_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2,4,8,16,32,64,128,256]
; X86-AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [256,128,64,32,16,8,4,2]
; X86-AVX1-NEXT:    vpmullw %xmm4, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: constant_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; X86-AVX2-NEXT:    vpsraw $8, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; X86-AVX2-NEXT:    vpsraw $8, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <32 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrad $7, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $7, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $7, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744073709551609,18446744073709551609]
; XOPAVX1-NEXT:    vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrad $7, %ymm0, %ymm1
; XOPAVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpsraq $7, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraq $7, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpsrad $7, %xmm1, %xmm2
; X86-AVX1-NEXT:    vpsrlq $7, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-AVX1-NEXT:    vpsrad $7, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrad $7, %ymm0, %ymm1
; X86-AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; X86-AVX2-NEXT:    retl
  %shift = ashr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
  ret <4 x i64> %shift
}

define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $5, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrad $5, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrad $5, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $5, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrad $5, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vpsrad $5, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrad $5, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %shift
}

define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsraw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsraw $3, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <16 x i16> %shift
}

define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
; XOPAVX1-NEXT:    vpshab %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshab %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; XOPAVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatconstant_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; X86-AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatconstant_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <32 x i8> %shift
}

;
; Special Cases
;

define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: shift32_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shift32_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: shift32_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744073709551584,18446744073709551584]
; XOPAVX1-NEXT:    vpshaq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpshaq %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: shift32_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrad $31, %ymm0, %ymm1
; XOPAVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: shift32_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpsraq $32, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: shift32_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraq $32, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: shift32_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: shift32_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrad $31, %ymm0, %ymm1
; X86-AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X86-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; X86-AVX2-NEXT:    retl
  %shift = ashr <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
  ret <4 x i64> %shift
}
