1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
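; Note on the RUN lines above: each FileCheck invocation pairs a shared prefix
; (AVX, AVX512, XOP) for output that is identical across the related configs
; with a specific prefix (e.g. AVX1, AVX512VL, XOPAVX2) for output that differs.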
10
11;
12; Variable Rotates
13;
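; Each function below builds a rotate from the shift identity
;   rot_left(a, b) = (a << b) | (a >> (BitWidth - b))
; and checks that the backend recognizes the pattern (vprolv* on AVX-512,
; vprot* on XOP, shift-and-or sequences elsewhere).
;
; Illustrative sketch only (not part of the autogenerated assertions): for
; in-range, non-zero amounts the same rotate can be written with the generic
; funnel-shift intrinsic, e.g. for the <4 x i64> case:
;
;   declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>)
;
;   define <4 x i64> @rotl_v4i64_fshl(<4 x i64> %a, <4 x i64> %b) {
;     %r = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a, <4 x i64> %a, <4 x i64> %b)
;     ret <4 x i64> %r
;   }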
14
15define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
16; AVX1-LABEL: var_rotate_v4i64:
17; AVX1:       # %bb.0:
18; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
19; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
20; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
21; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm2
22; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
23; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm6
24; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
25; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm4
26; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
27; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm6
28; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
29; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
30; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
31; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
32; AVX1-NEXT:    vpsrlq %xmm2, %xmm5, %xmm4
33; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
34; AVX1-NEXT:    vpsrlq %xmm2, %xmm5, %xmm2
35; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
36; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm4
37; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
38; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm0
39; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
40; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
41; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
42; AVX1-NEXT:    retq
43;
44; AVX2-LABEL: var_rotate_v4i64:
45; AVX2:       # %bb.0:
46; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
47; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
48; AVX2-NEXT:    vpsllvq %ymm1, %ymm0, %ymm1
49; AVX2-NEXT:    vpsrlvq %ymm2, %ymm0, %ymm0
50; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
51; AVX2-NEXT:    retq
52;
53; AVX512F-LABEL: var_rotate_v4i64:
54; AVX512F:       # %bb.0:
55; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
56; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
57; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
58; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
59; AVX512F-NEXT:    retq
60;
61; AVX512VL-LABEL: var_rotate_v4i64:
62; AVX512VL:       # %bb.0:
63; AVX512VL-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
64; AVX512VL-NEXT:    retq
65;
66; AVX512BW-LABEL: var_rotate_v4i64:
67; AVX512BW:       # %bb.0:
68; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
69; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
70; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
71; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
72; AVX512BW-NEXT:    retq
73;
74; AVX512VLBW-LABEL: var_rotate_v4i64:
75; AVX512VLBW:       # %bb.0:
76; AVX512VLBW-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
77; AVX512VLBW-NEXT:    retq
78;
79; XOPAVX1-LABEL: var_rotate_v4i64:
80; XOPAVX1:       # %bb.0:
81; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
82; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
83; XOPAVX1-NEXT:    vprotq %xmm2, %xmm3, %xmm2
84; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
85; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
86; XOPAVX1-NEXT:    retq
87;
88; XOPAVX2-LABEL: var_rotate_v4i64:
89; XOPAVX2:       # %bb.0:
90; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
91; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
92; XOPAVX2-NEXT:    vprotq %xmm2, %xmm3, %xmm2
93; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
94; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
95; XOPAVX2-NEXT:    retq
96  %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
97  %shl = shl <4 x i64> %a, %b
98  %lshr = lshr <4 x i64> %a, %b64
99  %or = or <4 x i64> %shl, %lshr
100  ret <4 x i64> %or
101}
102
103define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
104; AVX1-LABEL: var_rotate_v8i32:
105; AVX1:       # %bb.0:
106; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
107; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [31,31,31,31]
108; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
109; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
110; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
111; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
112; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
113; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
114; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
115; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
116; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
117; AVX1-NEXT:    vpmuludq %xmm2, %xmm6, %xmm2
118; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
119; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
120; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
121; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
122; AVX1-NEXT:    vpor %xmm6, %xmm2, %xmm2
123; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
124; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
125; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
126; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
127; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
128; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
129; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
130; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
131; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
132; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
133; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
134; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
135; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
136; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
137; AVX1-NEXT:    retq
138;
139; AVX2-LABEL: var_rotate_v8i32:
140; AVX2:       # %bb.0:
141; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
142; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
143; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm2
144; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
145; AVX2-NEXT:    vpsubd %ymm1, %ymm3, %ymm1
146; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
147; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
148; AVX2-NEXT:    retq
149;
150; AVX512F-LABEL: var_rotate_v8i32:
151; AVX512F:       # %bb.0:
152; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
153; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
154; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
155; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
156; AVX512F-NEXT:    retq
157;
158; AVX512VL-LABEL: var_rotate_v8i32:
159; AVX512VL:       # %bb.0:
160; AVX512VL-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
161; AVX512VL-NEXT:    retq
162;
163; AVX512BW-LABEL: var_rotate_v8i32:
164; AVX512BW:       # %bb.0:
165; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
166; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
167; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
168; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
169; AVX512BW-NEXT:    retq
170;
171; AVX512VLBW-LABEL: var_rotate_v8i32:
172; AVX512VLBW:       # %bb.0:
173; AVX512VLBW-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
174; AVX512VLBW-NEXT:    retq
175;
176; XOPAVX1-LABEL: var_rotate_v8i32:
177; XOPAVX1:       # %bb.0:
178; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
179; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
180; XOPAVX1-NEXT:    vprotd %xmm2, %xmm3, %xmm2
181; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
182; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
183; XOPAVX1-NEXT:    retq
184;
185; XOPAVX2-LABEL: var_rotate_v8i32:
186; XOPAVX2:       # %bb.0:
187; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
188; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
189; XOPAVX2-NEXT:    vprotd %xmm2, %xmm3, %xmm2
190; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
191; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
192; XOPAVX2-NEXT:    retq
193  %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
194  %shl = shl <8 x i32> %a, %b
195  %lshr = lshr <8 x i32> %a, %b32
196  %or = or <8 x i32> %shl, %lshr
197  ret <8 x i32> %or
198}
199
200define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
201; AVX1-LABEL: var_rotate_v16i16:
202; AVX1:       # %bb.0:
203; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
204; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
205; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
206; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
207; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
208; AVX1-NEXT:    vpslld $23, %xmm5, %xmm5
209; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
210; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm5
211; AVX1-NEXT:    vcvttps2dq %xmm5, %xmm5
212; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
213; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
214; AVX1-NEXT:    vpaddd %xmm6, %xmm2, %xmm2
215; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
216; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
217; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
218; AVX1-NEXT:    vpmulhuw %xmm2, %xmm5, %xmm7
219; AVX1-NEXT:    vpmullw %xmm2, %xmm5, %xmm2
220; AVX1-NEXT:    vpor %xmm7, %xmm2, %xmm2
221; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
222; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
223; AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
224; AVX1-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
225; AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
226; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
227; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
228; AVX1-NEXT:    vpaddd %xmm6, %xmm1, %xmm1
229; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
230; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
231; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm3
232; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
233; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
234; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
235; AVX1-NEXT:    retq
236;
237; AVX2-LABEL: var_rotate_v16i16:
238; AVX2:       # %bb.0:
239; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
240; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
241; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
242; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
243; AVX2-NEXT:    vpsllvd %ymm4, %ymm3, %ymm4
244; AVX2-NEXT:    vpsrld $16, %ymm4, %ymm4
245; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
246; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
247; AVX2-NEXT:    vpsllvd %ymm5, %ymm0, %ymm5
248; AVX2-NEXT:    vpsrld $16, %ymm5, %ymm5
249; AVX2-NEXT:    vpackusdw %ymm4, %ymm5, %ymm4
250; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
251; AVX2-NEXT:    vpsubw %ymm1, %ymm5, %ymm1
252; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
253; AVX2-NEXT:    vpsrlvd %ymm5, %ymm3, %ymm3
254; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
255; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
256; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
257; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
258; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
259; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
260; AVX2-NEXT:    retq
261;
262; AVX512F-LABEL: var_rotate_v16i16:
263; AVX512F:       # %bb.0:
264; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
265; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
266; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
267; AVX512F-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
268; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
269; AVX512F-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
270; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
271; AVX512F-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
272; AVX512F-NEXT:    vpord %zmm0, %zmm2, %zmm0
273; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
274; AVX512F-NEXT:    retq
275;
276; AVX512VL-LABEL: var_rotate_v16i16:
277; AVX512VL:       # %bb.0:
278; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
279; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
280; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
281; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm2
282; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
283; AVX512VL-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
284; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
285; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
286; AVX512VL-NEXT:    vpord %zmm0, %zmm2, %zmm0
287; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
288; AVX512VL-NEXT:    retq
289;
290; AVX512BW-LABEL: var_rotate_v16i16:
291; AVX512BW:       # %bb.0:
292; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
293; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
294; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
295; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
296; AVX512BW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
297; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
298; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
299; AVX512BW-NEXT:    retq
300;
301; AVX512VLBW-LABEL: var_rotate_v16i16:
302; AVX512VLBW:       # %bb.0:
303; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
304; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2
305; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
306; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
307; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
308; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
309; AVX512VLBW-NEXT:    retq
310;
311; XOPAVX1-LABEL: var_rotate_v16i16:
312; XOPAVX1:       # %bb.0:
313; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
314; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
315; XOPAVX1-NEXT:    vprotw %xmm2, %xmm3, %xmm2
316; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
317; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
318; XOPAVX1-NEXT:    retq
319;
320; XOPAVX2-LABEL: var_rotate_v16i16:
321; XOPAVX2:       # %bb.0:
322; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
323; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
324; XOPAVX2-NEXT:    vprotw %xmm2, %xmm3, %xmm2
325; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
326; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
327; XOPAVX2-NEXT:    retq
328  %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
329  %shl = shl <16 x i16> %a, %b
330  %lshr = lshr <16 x i16> %a, %b16
331  %or = or <16 x i16> %shl, %lshr
332  ret <16 x i16> %or
333}
334
335define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
336; AVX1-LABEL: var_rotate_v32i8:
337; AVX1:       # %bb.0:
338; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
339; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
340; AVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
341; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
342; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm5
343; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
344; AVX1-NEXT:    vpand %xmm9, %xmm5, %xmm5
345; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
346; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
347; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
348; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
349; AVX1-NEXT:    vpsrlw $6, %xmm2, %xmm3
350; AVX1-NEXT:    vmovdqa {{.*#+}} xmm10 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
351; AVX1-NEXT:    vpand %xmm10, %xmm3, %xmm3
352; AVX1-NEXT:    vpsllw $2, %xmm2, %xmm4
353; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
354; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
355; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
356; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm4
357; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
358; AVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
359; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
360; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
361; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm7
362; AVX1-NEXT:    vpor %xmm3, %xmm7, %xmm3
363; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm4
364; AVX1-NEXT:    vpblendvb %xmm4, %xmm3, %xmm2, %xmm2
365; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
366; AVX1-NEXT:    vpand %xmm8, %xmm3, %xmm3
367; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
368; AVX1-NEXT:    vpand %xmm9, %xmm4, %xmm4
369; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
370; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
371; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
372; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm3
373; AVX1-NEXT:    vpand %xmm10, %xmm3, %xmm3
374; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm4
375; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
376; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
377; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
378; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
379; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
380; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
381; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
382; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
383; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
384; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
385; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
386; AVX1-NEXT:    retq
387;
388; AVX2-LABEL: var_rotate_v32i8:
389; AVX2:       # %bb.0:
390; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
391; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
392; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm3
393; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
394; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
395; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
396; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
397; AVX2-NEXT:    vpsrlw $6, %ymm0, %ymm2
398; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
399; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
400; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
401; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
402; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
403; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
404; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
405; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm3
406; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
407; AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
408; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
409; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
410; AVX2-NEXT:    retq
411;
412; AVX512F-LABEL: var_rotate_v32i8:
413; AVX512F:       # %bb.0:
414; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm2
415; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
416; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm3
417; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
418; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
419; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
420; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
421; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm2
422; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
423; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm3
424; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
425; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
426; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
427; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
428; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
429; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
430; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
431; AVX512F-NEXT:    vpor %ymm3, %ymm2, %ymm2
432; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
433; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
434; AVX512F-NEXT:    retq
435;
436; AVX512VL-LABEL: var_rotate_v32i8:
437; AVX512VL:       # %bb.0:
438; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm2
439; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
440; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm3
441; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
442; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
443; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
444; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
445; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm2
446; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
447; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm3
448; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
449; AVX512VL-NEXT:    vpor %ymm2, %ymm3, %ymm2
450; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
451; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
452; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm2
453; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
454; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
455; AVX512VL-NEXT:    vpor %ymm3, %ymm2, %ymm2
456; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
457; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
458; AVX512VL-NEXT:    retq
459;
460; AVX512BW-LABEL: var_rotate_v32i8:
461; AVX512BW:       # %bb.0:
462; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
463; AVX512BW-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
464; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
465; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
466; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
467; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
468; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
469; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
470; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
471; AVX512BW-NEXT:    retq
472;
473; AVX512VLBW-LABEL: var_rotate_v32i8:
474; AVX512VLBW:       # %bb.0:
475; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
476; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
477; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
478; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
479; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
480; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
481; AVX512VLBW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
482; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
483; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
484; AVX512VLBW-NEXT:    retq
485;
486; XOPAVX1-LABEL: var_rotate_v32i8:
487; XOPAVX1:       # %bb.0:
488; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
489; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
490; XOPAVX1-NEXT:    vprotb %xmm2, %xmm3, %xmm2
491; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
492; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
493; XOPAVX1-NEXT:    retq
494;
495; XOPAVX2-LABEL: var_rotate_v32i8:
496; XOPAVX2:       # %bb.0:
497; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
498; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
499; XOPAVX2-NEXT:    vprotb %xmm2, %xmm3, %xmm2
500; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
501; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
502; XOPAVX2-NEXT:    retq
503  %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
504  %shl = shl <32 x i8> %a, %b
505  %lshr = lshr <32 x i8> %a, %b8
506  %or = or <32 x i8> %shl, %lshr
507  ret <32 x i8> %or
508}
509
510;
511; Uniform Variable Rotates
512;
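; In this group the rotate amount is a single element splatted to every lane
; via a shufflevector with a zeroinitializer mask, so the lowerings checked
; below can use a single per-register shift count (e.g. vpsllq/vpsrlq with an
; xmm count) or broadcast the amount before vprolv*/vprot*.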
513
514define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
515; AVX1-LABEL: splatvar_rotate_v4i64:
516; AVX1:       # %bb.0:
517; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = xmm1[0,0]
518; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
519; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
520; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
521; AVX1-NEXT:    vpsllq %xmm1, %xmm3, %xmm4
522; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
523; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
524; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
525; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
526; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
527; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
528; AVX1-NEXT:    retq
529;
530; AVX2-LABEL: splatvar_rotate_v4i64:
531; AVX2:       # %bb.0:
532; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm2
533; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [64,64]
534; AVX2-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
535; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
536; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
537; AVX2-NEXT:    retq
538;
539; AVX512F-LABEL: splatvar_rotate_v4i64:
540; AVX512F:       # %bb.0:
541; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
542; AVX512F-NEXT:    vpbroadcastq %xmm1, %ymm1
543; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
544; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
545; AVX512F-NEXT:    retq
546;
547; AVX512VL-LABEL: splatvar_rotate_v4i64:
548; AVX512VL:       # %bb.0:
549; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
550; AVX512VL-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
551; AVX512VL-NEXT:    retq
552;
553; AVX512BW-LABEL: splatvar_rotate_v4i64:
554; AVX512BW:       # %bb.0:
555; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
556; AVX512BW-NEXT:    vpbroadcastq %xmm1, %ymm1
557; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
558; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
559; AVX512BW-NEXT:    retq
560;
561; AVX512VLBW-LABEL: splatvar_rotate_v4i64:
562; AVX512VLBW:       # %bb.0:
563; AVX512VLBW-NEXT:    vpbroadcastq %xmm1, %ymm1
564; AVX512VLBW-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
565; AVX512VLBW-NEXT:    retq
566;
567; XOPAVX1-LABEL: splatvar_rotate_v4i64:
568; XOPAVX1:       # %bb.0:
569; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
570; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
571; XOPAVX1-NEXT:    vprotq %xmm1, %xmm2, %xmm2
572; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
573; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
574; XOPAVX1-NEXT:    retq
575;
576; XOPAVX2-LABEL: splatvar_rotate_v4i64:
577; XOPAVX2:       # %bb.0:
578; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
579; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
580; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
581; XOPAVX2-NEXT:    vprotq %xmm3, %xmm2, %xmm2
582; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
583; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
584; XOPAVX2-NEXT:    retq
585  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
586  %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat
587  %shl = shl <4 x i64> %a, %splat
588  %lshr = lshr <4 x i64> %a, %splat64
589  %or = or <4 x i64> %shl, %lshr
590  ret <4 x i64> %or
591}
592
593define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
594; AVX1-LABEL: splatvar_rotate_v8i32:
595; AVX1:       # %bb.0:
596; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
597; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
598; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
599; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
600; AVX1-NEXT:    vpslld %xmm3, %xmm2, %xmm4
601; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
602; AVX1-NEXT:    vpsubd %xmm1, %xmm5, %xmm1
603; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
604; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
605; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
606; AVX1-NEXT:    vpslld %xmm3, %xmm0, %xmm3
607; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
608; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
609; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
610; AVX1-NEXT:    retq
611;
612; AVX2-LABEL: splatvar_rotate_v8i32:
613; AVX2:       # %bb.0:
614; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
615; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
616; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
617; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
618; AVX2-NEXT:    vpslld %xmm2, %ymm0, %ymm2
619; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
620; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
621; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
622; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
623; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
624; AVX2-NEXT:    retq
625;
626; AVX512F-LABEL: splatvar_rotate_v8i32:
627; AVX512F:       # %bb.0:
628; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
629; AVX512F-NEXT:    vpbroadcastd %xmm1, %ymm1
630; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
631; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
632; AVX512F-NEXT:    retq
633;
634; AVX512VL-LABEL: splatvar_rotate_v8i32:
635; AVX512VL:       # %bb.0:
636; AVX512VL-NEXT:    vpbroadcastd %xmm1, %ymm1
637; AVX512VL-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
638; AVX512VL-NEXT:    retq
639;
640; AVX512BW-LABEL: splatvar_rotate_v8i32:
641; AVX512BW:       # %bb.0:
642; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
643; AVX512BW-NEXT:    vpbroadcastd %xmm1, %ymm1
644; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
645; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
646; AVX512BW-NEXT:    retq
647;
648; AVX512VLBW-LABEL: splatvar_rotate_v8i32:
649; AVX512VLBW:       # %bb.0:
650; AVX512VLBW-NEXT:    vpbroadcastd %xmm1, %ymm1
651; AVX512VLBW-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
652; AVX512VLBW-NEXT:    retq
653;
654; XOPAVX1-LABEL: splatvar_rotate_v8i32:
655; XOPAVX1:       # %bb.0:
656; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
657; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
658; XOPAVX1-NEXT:    vprotd %xmm1, %xmm2, %xmm2
659; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
660; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
661; XOPAVX1-NEXT:    retq
662;
663; XOPAVX2-LABEL: splatvar_rotate_v8i32:
664; XOPAVX2:       # %bb.0:
665; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
666; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
667; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
668; XOPAVX2-NEXT:    vprotd %xmm3, %xmm2, %xmm2
669; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
670; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
671; XOPAVX2-NEXT:    retq
672  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
673  %splat32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
674  %shl = shl <8 x i32> %a, %splat
675  %lshr = lshr <8 x i32> %a, %splat32
676  %or = or <8 x i32> %shl, %lshr
677  ret <8 x i32> %or
678}
679
680define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
681; AVX1-LABEL: splatvar_rotate_v16i16:
682; AVX1:       # %bb.0:
683; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
684; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
685; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
686; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
687; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
688; AVX1-NEXT:    vpsllw %xmm3, %xmm2, %xmm4
689; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
690; AVX1-NEXT:    vpsubw %xmm1, %xmm5, %xmm1
691; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
692; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
693; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
694; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
695; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
696; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
697; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
698; AVX1-NEXT:    retq
699;
700; AVX2-LABEL: splatvar_rotate_v16i16:
701; AVX2:       # %bb.0:
702; AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
703; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
704; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
705; AVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
706; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
707; AVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
708; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
709; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
710; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
711; AVX2-NEXT:    retq
712;
713; AVX512-LABEL: splatvar_rotate_v16i16:
714; AVX512:       # %bb.0:
715; AVX512-NEXT:    vpbroadcastw %xmm1, %ymm1
716; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
717; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
718; AVX512-NEXT:    vpsllw %xmm2, %ymm0, %ymm2
719; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
720; AVX512-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
721; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
722; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
723; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
724; AVX512-NEXT:    retq
725;
726; XOPAVX1-LABEL: splatvar_rotate_v16i16:
727; XOPAVX1:       # %bb.0:
728; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
729; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
730; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
731; XOPAVX1-NEXT:    vprotw %xmm1, %xmm2, %xmm2
732; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
733; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
734; XOPAVX1-NEXT:    retq
735;
736; XOPAVX2-LABEL: splatvar_rotate_v16i16:
737; XOPAVX2:       # %bb.0:
738; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
739; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
740; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
741; XOPAVX2-NEXT:    vprotw %xmm3, %xmm2, %xmm2
742; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
743; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
744; XOPAVX2-NEXT:    retq
745  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
746  %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
747  %shl = shl <16 x i16> %a, %splat
748  %lshr = lshr <16 x i16> %a, %splat16
749  %or = or <16 x i16> %shl, %lshr
750  ret <16 x i16> %or
751}
752
753define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
754; AVX1-LABEL: splatvar_rotate_v32i8:
755; AVX1:       # %bb.0:
756; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
757; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
758; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
759; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
760; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
761; AVX1-NEXT:    vpsllw %xmm3, %xmm4, %xmm5
762; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
763; AVX1-NEXT:    vpsllw %xmm3, %xmm6, %xmm7
764; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
765; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
766; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
767; AVX1-NEXT:    vpsubb %xmm1, %xmm7, %xmm1
768; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
769; AVX1-NEXT:    vpsrlw %xmm1, %xmm4, %xmm4
770; AVX1-NEXT:    vpsrlw %xmm1, %xmm6, %xmm6
771; AVX1-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
772; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
773; AVX1-NEXT:    vpor %xmm4, %xmm5, %xmm4
774; AVX1-NEXT:    vpsllw %xmm3, %xmm0, %xmm3
775; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
776; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
777; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm0
778; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
779; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
780; AVX1-NEXT:    retq
781;
782; AVX2-LABEL: splatvar_rotate_v32i8:
783; AVX2:       # %bb.0:
784; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
785; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
786; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
787; AVX2-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
788; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
789; AVX2-NEXT:    vpsllw %xmm2, %ymm4, %ymm2
790; AVX2-NEXT:    vpbroadcastb %xmm2, %ymm2
791; AVX2-NEXT:    vpand %ymm2, %ymm3, %ymm2
792; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
793; AVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
794; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
795; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
796; AVX2-NEXT:    vpsrlw %xmm1, %ymm4, %ymm1
797; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
798; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
799; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
800; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
801; AVX2-NEXT:    retq
802;
803; AVX512F-LABEL: splatvar_rotate_v32i8:
804; AVX512F:       # %bb.0:
805; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
806; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
807; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
808; AVX512F-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
809; AVX512F-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
810; AVX512F-NEXT:    vpsllw %xmm2, %ymm4, %ymm2
811; AVX512F-NEXT:    vpbroadcastb %xmm2, %ymm2
812; AVX512F-NEXT:    vpand %ymm2, %ymm3, %ymm2
813; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
814; AVX512F-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
815; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
816; AVX512F-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
817; AVX512F-NEXT:    vpsrlw %xmm1, %ymm4, %ymm1
818; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
819; AVX512F-NEXT:    vpbroadcastb %xmm1, %ymm1
820; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
821; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
822; AVX512F-NEXT:    retq
823;
824; AVX512VL-LABEL: splatvar_rotate_v32i8:
825; AVX512VL:       # %bb.0:
826; AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
827; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
828; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
829; AVX512VL-NEXT:    vpsllw %xmm2, %ymm0, %ymm3
830; AVX512VL-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
831; AVX512VL-NEXT:    vpsllw %xmm2, %ymm4, %ymm2
832; AVX512VL-NEXT:    vpbroadcastb %xmm2, %ymm2
833; AVX512VL-NEXT:    vpand %ymm2, %ymm3, %ymm2
834; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
835; AVX512VL-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
836; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
837; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
838; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm4, %ymm1
839; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
840; AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
841; AVX512VL-NEXT:    vpand %ymm1, %ymm0, %ymm0
842; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
843; AVX512VL-NEXT:    retq
844;
845; AVX512BW-LABEL: splatvar_rotate_v32i8:
846; AVX512BW:       # %bb.0:
847; AVX512BW-NEXT:    vpbroadcastb %xmm1, %ymm1
848; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
849; AVX512BW-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
850; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
851; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
852; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
853; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
854; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
855; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
856; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
857; AVX512BW-NEXT:    retq
858;
859; AVX512VLBW-LABEL: splatvar_rotate_v32i8:
860; AVX512VLBW:       # %bb.0:
861; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %ymm1
862; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
863; AVX512VLBW-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
864; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
865; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
866; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
867; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
868; AVX512VLBW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
869; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
870; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
871; AVX512VLBW-NEXT:    retq
872;
873; XOPAVX1-LABEL: splatvar_rotate_v32i8:
874; XOPAVX1:       # %bb.0:
875; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
876; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
877; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
878; XOPAVX1-NEXT:    vprotb %xmm1, %xmm2, %xmm2
879; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
880; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
881; XOPAVX1-NEXT:    retq
882;
883; XOPAVX2-LABEL: splatvar_rotate_v32i8:
884; XOPAVX2:       # %bb.0:
885; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
886; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
887; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
888; XOPAVX2-NEXT:    vprotb %xmm3, %xmm2, %xmm2
889; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
890; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
891; XOPAVX2-NEXT:    retq
892  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
893  %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
894  %shl = shl <32 x i8> %a, %splat
895  %lshr = lshr <32 x i8> %a, %splat8
896  %or = or <32 x i8> %shl, %lshr
897  ret <32 x i8> %or
898}
899
900;
901; Constant Rotates
902;
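; Here the rotate amounts are per-element constants: the shl and lshr amount
; vectors are complementary, each pair summing to the element width (e.g.
; <4,14,50,60> and <60,50,14,4> for i64), which is the same rotate identity
; with the amounts folded to constants.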
903
904define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
905; AVX1-LABEL: constant_rotate_v4i64:
906; AVX1:       # %bb.0:
907; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
908; AVX1-NEXT:    vpsllq $60, %xmm1, %xmm2
909; AVX1-NEXT:    vpsllq $50, %xmm1, %xmm3
910; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
911; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm3
912; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm4
913; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
914; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
915; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm3
916; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm1
917; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
918; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm3
919; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
920; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
921; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
922; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
923; AVX1-NEXT:    retq
924;
925; AVX2-LABEL: constant_rotate_v4i64:
926; AVX2:       # %bb.0:
927; AVX2-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm1
928; AVX2-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
929; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
930; AVX2-NEXT:    retq
931;
932; AVX512F-LABEL: constant_rotate_v4i64:
933; AVX512F:       # %bb.0:
934; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
935; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
936; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
937; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
938; AVX512F-NEXT:    retq
939;
940; AVX512VL-LABEL: constant_rotate_v4i64:
941; AVX512VL:       # %bb.0:
942; AVX512VL-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
943; AVX512VL-NEXT:    retq
944;
945; AVX512BW-LABEL: constant_rotate_v4i64:
946; AVX512BW:       # %bb.0:
947; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
948; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
949; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
950; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
951; AVX512BW-NEXT:    retq
952;
953; AVX512VLBW-LABEL: constant_rotate_v4i64:
954; AVX512VLBW:       # %bb.0:
955; AVX512VLBW-NEXT:    vprolvq {{.*}}(%rip), %ymm0, %ymm0
956; AVX512VLBW-NEXT:    retq
957;
958; XOPAVX1-LABEL: constant_rotate_v4i64:
959; XOPAVX1:       # %bb.0:
960; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
961; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
962; XOPAVX1-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
963; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
964; XOPAVX1-NEXT:    retq
965;
966; XOPAVX2-LABEL: constant_rotate_v4i64:
967; XOPAVX2:       # %bb.0:
968; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm1
969; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
970; XOPAVX2-NEXT:    vprotq {{.*}}(%rip), %xmm0, %xmm0
971; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
972; XOPAVX2-NEXT:    retq
973  %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
974  %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4>
975  %or = or <4 x i64> %shl, %lshr
976  ret <4 x i64> %or
977}
978
979define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
980; AVX1-LABEL: constant_rotate_v8i32:
981; AVX1:       # %bb.0:
982; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048]
983; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
984; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
985; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
986; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm2
987; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm1
988; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
989; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
990; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
991; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
992; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
993; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,32,64,128]
994; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
995; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
996; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
997; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
998; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
999; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v8i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v8i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v8i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2]
; AVX1-NEXT:    vpmullw %xmm9, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128]
; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm7
; AVX1-NEXT:    vpsrlw $8, %xmm7, %xmm7
; AVX1-NEXT:    vpackuswb %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2]
; AVX1-NEXT:    vpmullw %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vpackuswb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
; AVX1-NEXT:    vpmullw %xmm9, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm6
; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm6
; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $2, %ymm1, %ymm3
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
; AVX2-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512F-NEXT:    vpsllw $2, %ymm1, %ymm3
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT:    vpsllw $2, %ymm1, %ymm3
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm3
; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm2
; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT:    vporq %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX1-NEXT:    vprotb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vprotb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX2-NEXT:    vprotb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotb %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Uniform Constant Rotates
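; Every lane is rotated by the same constant amount, so targets with an
; immediate rotate (AVX512 vprolq/vprold, XOP vprotq/vprotd/vprotw/vprotb)
; can select it directly instead of expanding to shl+lshr+or.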
;

define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsllq $14, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlq $50, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $14, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlq $50, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v4i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprolq $14, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolq $14, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprolq $14, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v4i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolq $14, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v8i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v8i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v8i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $9, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $9, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $9, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_rotate_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $9, %ymm0, %ymm1
; AVX512-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Masked Uniform Constant Rotates
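; As above, but the shifted halves are AND-masked before being OR'd back
; together, so matching the rotate has to look through the masks; the
; AVX512 i32/i64 cases can still select vprol* and apply the masking with
; a single vpand afterwards.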
;

define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprolq $15, %zmm0, %zmm0
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolq $15, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprolq $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolq $15, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotq $15, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotq $15, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotq $15, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotq $15, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}

define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}

define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $11, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $11, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $11, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $5, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $11, %ymm0, %ymm1
; AVX512-NEXT:    vpsllw $5, %ymm0, %ymm0
; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}
