; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW

;
; Variable Shifts
;

define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, %b
  ret <8 x i64> %shift
}

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, %b
  ret <16 x i32> %shift
}

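; Note: without AVX512BW there is no 512-bit variable i16 shift, so the
; AVX512DQ lowering zero-extends each 256-bit half to v16i32 (vpmovzxwd),
; shifts with vpsrlvd, and truncates back with vpmovdw.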
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, %b
  ret <32 x i16> %shift
}

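; Note: x86 has no variable per-byte shift. Both paths shift by 4, 2 and 1
; and conditionally keep each step per byte: vpsllw $5 moves the relevant
; shift-amount bit into each byte's sign bit, which AVX512DQ consumes via
; vpblendvb on 256-bit halves and AVX512BW via vpmovb2m plus a masked
; vmovdqu8.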
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpsrlw $4, %ymm2, %ymm3
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
; AVX512DQ-NEXT:    vpsllw $5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $2, %ymm2, %ymm3
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $1, %ymm2, %ymm3
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $2, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, %b
  ret <64 x i8> %shift
}

;
; Uniform Variable Shifts
;

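; Note: a splatted amount allows the legacy shift forms that take a single
; 64-bit count in the low quadword of an xmm register; the narrower element
; types first zero-extend element 0 (vpmovzxdq/vpmovzxwq/vpmovzxbq) so the
; unused count bits are known zero.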
define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  %shift = lshr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %shift = lshr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
  %shift = lshr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

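; Note: the splatted byte shift is done at word granularity, which lets bits
; leak across byte boundaries. The fix-up mask (0xFF >> n in every byte) is
; built by shifting all-ones words right by the same amount, extracting the
; high byte with vpsrlw $8, and broadcasting it for the final vpandq.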
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
  %shift = lshr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Constant Shifts
;

define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
  ret <8 x i64> %shift
}

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift
}

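; Note: lacking vpsrlvw, AVX512DQ multiplies by 2^(16-n) and keeps the high
; half (vpmulhuw), which equals lshr by n. A shift of 0 would need a 2^16
; multiplier, so word 0 of each 256-bit half is blended back from the
; unshifted input instead.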
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
}

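; Note: bytes are zero-extended to words (unpack with zero), shifted up so
; the result lands in the high byte (vpmullw by 2^(8-n) on AVX512DQ,
; vpsllvw on AVX512BW), shifted back down with vpsrlw $8, and repacked with
; vpackuswb; per byte this computes x >> n.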
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <64 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlq $7, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %shift
}

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrld $5, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift
}

define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift
}

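; Note: with a uniform constant amount the bytes are simply shifted as words
; (vpsrlw $3) and a single vpandq clears the bits pulled in from each
; neighbouring byte.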
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}
