; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2

;
; Variable Rotates
;

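; Rotates have no dedicated instruction in LLVM IR, so each test below builds
; a variable rotate-left out of shifts and relies on the backend to recognize
; the idiom and select a native rotate where one exists (XOP vprot*, AVX512
; vprolv*, AVX512VBMI2 vpshldv*). The pattern, shown here for v2i64:
;
;   %shl  = shl  <2 x i64> %a, %b
;   %sub  = sub  <2 x i64> <i64 64, i64 64>, %b
;   %lshr = lshr <2 x i64> %a, %sub
;   %rot  = or   <2 x i64> %shl, %lshr
;
; Up to the handling of a zero amount (where the sub makes the lshr shift by
; the full element width), this is what the canonical funnel-shift intrinsic
; expresses directly, since @llvm.fshl takes its amount modulo the width:
;
;   %rot = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> %b)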
define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_rotate_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
; SSE2-NEXT:    psubq %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psllq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psllq %xmm1, %xmm4
; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT:    psrlq %xmm2, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    orpd %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_rotate_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
; SSE41-NEXT:    psubq %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psllq %xmm1, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    psllq %xmm1, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlq %xmm2, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT:    psrlq %xmm2, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    por %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_rotate_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v2i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_rotate_v2i64:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v2i64:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: var_rotate_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: var_rotate_v2i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,0,64,0]
; X86-SSE2-NEXT:    psubq %xmm1, %xmm2
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
; X86-SSE2-NEXT:    psllq %xmm1, %xmm3
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
; X86-SSE2-NEXT:    psllq %xmm1, %xmm4
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrlq %xmm2, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X86-SSE2-NEXT:    psrlq %xmm2, %xmm0
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT:    orpd %xmm4, %xmm0
; X86-SSE2-NEXT:    retl
  %b64 = sub <2 x i64> <i64 64, i64 64>, %b
  %shl = shl <2 x i64> %a, %b
  %lshr = lshr <2 x i64> %a, %b64
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

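; Without per-element variable shifts (pre-AVX2), the v4i32 rotate is lowered
; by materializing 2^amt in each lane: the masked amount is shifted into the
; float exponent field (pslld $23 plus the bias 1065353216 = 127 << 23, then
; cvttps2dq) and the rotate becomes a pmuludq widening multiply whose low and
; high 32-bit halves are shuffled back together and or'd.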
define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_rotate_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_rotate_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    pmuludq %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_rotate_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v4i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_rotate_v4i32:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v4i32:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: var_rotate_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: var_rotate_v4i32:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    pslld $23, %xmm1
; X86-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    por %xmm3, %xmm0
; X86-SSE2-NEXT:    retl
  %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <4 x i32> %a, %b
  %lshr = lshr <4 x i32> %a, %b32
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

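; v8i16 uses the same exponent trick to build 2^amt per lane; pmullw and
; pmulhuw then give the low and high 16 bits of the widening multiply, and
; or'ing them together is exactly the rotate.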
define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_rotate_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $23, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT:    paddd %xmm3, %xmm2
; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_rotate_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; SSE41-NEXT:    paddd %xmm3, %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pslld $23, %xmm2
; SSE41-NEXT:    paddd %xmm3, %xmm2
; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
; SSE41-NEXT:    packusdw %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
; SSE41-NEXT:    pmullw %xmm2, %xmm0
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_rotate_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT:    vpsllvd %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_rotate_v8i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v8i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldvw %xmm1, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: var_rotate_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: var_rotate_v8i16:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT:    pslld $23, %xmm2
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE2-NEXT:    paddd %xmm3, %xmm2
; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT:    pslld $23, %xmm1
; X86-SSE2-NEXT:    paddd %xmm3, %xmm1
; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    pmulhuw %xmm1, %xmm2
; X86-SSE2-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE2-NEXT:    por %xmm2, %xmm0
; X86-SSE2-NEXT:    retl
  %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <8 x i16> %a, %b
  %lshr = lshr <8 x i16> %a, %b16
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

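; x86 has no byte-granular shifts, so the v16i8 rotate is lowered bit-serially:
; psllw $5 moves the amount bits into each byte's sign bit, and rotates by 4,
; 2 and 1 are conditionally applied with pblendvb (SSE41/AVX) or a
; pcmpgtb/pand/pandn select (SSE2), doubling the control byte between steps.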
define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_rotate_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psllw $5, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm4
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    psllw $4, %xmm5
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
; SSE2-NEXT:    por %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm3, %xmm5
; SSE2-NEXT:    pandn %xmm2, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    psrlw $6, %xmm2
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psllw $2, %xmm4
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT:    por %xmm2, %xmm4
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; SSE2-NEXT:    pand %xmm2, %xmm4
; SSE2-NEXT:    pandn %xmm3, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    paddb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psrlw $7, %xmm4
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT:    por %xmm3, %xmm4
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm0, %xmm4
; SSE2-NEXT:    pandn %xmm2, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_rotate_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlw $4, %xmm0
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psllw $4, %xmm3
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT:    por %xmm0, %xmm3
; SSE41-NEXT:    psllw $5, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrlw $6, %xmm0
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psllw $2, %xmm3
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT:    por %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm2, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    paddb %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    psrlw $7, %xmm3
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT:    por %xmm0, %xmm3
; SSE41-NEXT:    paddb %xmm2, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_rotate_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm3
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $6, %xmm0, %xmm2
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT:    vpsllw $2, %xmm0, %xmm3
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vpsrlw $7, %xmm0, %xmm3
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT:    vpsllvd %zmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm1
; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT:    vpord %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512VLBW-NEXT:    vpsrlvw %ymm2, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT:    vzeroupper
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_rotate_v16i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512VBMI2-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v16i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT:    vpsllvw %ymm1, %ymm0, %ymm1
; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512VLVBMI2-NEXT:    vpsrlvw %ymm2, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT:    vzeroupper
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: var_rotate_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: var_rotate_v16i8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    psllw $5, %xmm1
; X86-SSE2-NEXT:    pxor %xmm0, %xmm0
; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
; X86-SSE2-NEXT:    psrlw $4, %xmm4
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
; X86-SSE2-NEXT:    psllw $4, %xmm5
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5
; X86-SSE2-NEXT:    por %xmm4, %xmm5
; X86-SSE2-NEXT:    pand %xmm3, %xmm5
; X86-SSE2-NEXT:    pandn %xmm2, %xmm3
; X86-SSE2-NEXT:    por %xmm5, %xmm3
; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
; X86-SSE2-NEXT:    psrlw $6, %xmm2
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
; X86-SSE2-NEXT:    psllw $2, %xmm4
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
; X86-SSE2-NEXT:    por %xmm2, %xmm4
; X86-SSE2-NEXT:    paddb %xmm1, %xmm1
; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
; X86-SSE2-NEXT:    pand %xmm2, %xmm4
; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
; X86-SSE2-NEXT:    por %xmm4, %xmm2
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
; X86-SSE2-NEXT:    paddb %xmm2, %xmm3
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
; X86-SSE2-NEXT:    psrlw $7, %xmm4
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
; X86-SSE2-NEXT:    por %xmm3, %xmm4
; X86-SSE2-NEXT:    paddb %xmm1, %xmm1
; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
; X86-SSE2-NEXT:    pand %xmm0, %xmm4
; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
; X86-SSE2-NEXT:    por %xmm4, %xmm0
; X86-SSE2-NEXT:    retl
  %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <16 x i8> %a, %b
  %lshr = lshr <16 x i8> %a, %b8
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Uniform Variable Rotates
;

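; When the rotate amount is a splat, targets without per-element rotates can
; fall back to the scalar-count vector shifts (psllq/psrlq and friends read
; their count from the low 64 bits of an XMM register), so the lowering only
; needs the amount, and its complement, in element 0.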
define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_rotate_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
; SSE-NEXT:    psubq %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    psllq %xmm1, %xmm3
; SSE-NEXT:    psrlq %xmm2, %xmm0
; SSE-NEXT:    por %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatvar_rotate_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatvar_rotate_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX512VL-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v2i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX512VLBW-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v2i64:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX512VBMI2-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v2i64:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpbroadcastq %xmm1, %xmm1
; AVX512VLVBMI2-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v2i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v2i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X86-SSE2-LABEL: splatvar_rotate_v2i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [64,0,64,0]
; X86-SSE2-NEXT:    psubq %xmm2, %xmm3
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    psllq %xmm1, %xmm2
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrlq %xmm3, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; X86-SSE2-NEXT:    psrlq %xmm3, %xmm0
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT:    orpd %xmm2, %xmm0
; X86-SSE2-NEXT:    retl
  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
  %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat
  %shl = shl <2 x i64> %a, %splat
  %lshr = lshr <2 x i64> %a, %splat64
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

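; For a splatted i32 amount, SSE2 bounces it through a GPR (movd, andl $31)
; to compute 32-amt, while SSE41+ stay in vector registers and zero-extend
; the low element with pmovzxdq because pslld/psrld consume a 64-bit count.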
define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_rotate_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    andl $31, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pslld %xmm1, %xmm2
; SSE2-NEXT:    movl $32, %ecx
; SSE2-NEXT:    subl %eax, %ecx
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_rotate_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pslld %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
; SSE41-NEXT:    psubd %xmm1, %xmm2
; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    por %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: splatvar_rotate_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpslld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
; AVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT:    vpslld %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatvar_rotate_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX512VL-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v4i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX512VLBW-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v4i32:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX512VBMI2-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v4i32:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX512VLVBMI2-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v4i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v4i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X86-SSE2-LABEL: splatvar_rotate_v4i32:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movd %xmm1, %eax
; X86-SSE2-NEXT:    andl $31, %eax
; X86-SSE2-NEXT:    movd %eax, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    pslld %xmm1, %xmm2
; X86-SSE2-NEXT:    movl $32, %ecx
; X86-SSE2-NEXT:    subl %eax, %ecx
; X86-SSE2-NEXT:    movd %ecx, %xmm1
; X86-SSE2-NEXT:    psrld %xmm1, %xmm0
; X86-SSE2-NEXT:    por %xmm2, %xmm0
; X86-SSE2-NEXT:    retl
  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <4 x i32> %a, %splat
  %lshr = lshr <4 x i32> %a, %splat32
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

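; psllw/psrlw likewise take a scalar count that must be zero-extended in the
; low quadword: SSE2 isolates the bottom word with a pslldq/psrldq pair,
; SSE41+ with pmovzxwq.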
define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_rotate_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; SSE2-NEXT:    psubw %xmm1, %xmm2
; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psllw %xmm1, %xmm3
; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    psrlw %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_rotate_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psllw %xmm2, %xmm3
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; SSE41-NEXT:    psubw %xmm1, %xmm2
; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; SSE41-NEXT:    psrlw %xmm1, %xmm0
; SSE41-NEXT:    por %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: splatvar_rotate_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatvar_rotate_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512F-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; AVX512F-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512F-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT:    vpsllw %xmm2, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VLBW-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v8i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpbroadcastw %xmm1, %xmm1
; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v8i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm1, %xmm1
; AVX512VLVBMI2-NEXT:    vpshldvw %xmm1, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v8i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v8i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X86-SSE2-LABEL: splatvar_rotate_v8i16:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; X86-SSE2-NEXT:    psubw %xmm1, %xmm2
; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
; X86-SSE2-NEXT:    psllw %xmm1, %xmm3
; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT:    psrlw %xmm2, %xmm0
; X86-SSE2-NEXT:    por %xmm3, %xmm0
; X86-SSE2-NEXT:    retl
  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
  %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <8 x i16> %a, %splat
  %lshr = lshr <8 x i16> %a, %splat16
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

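; There is no psllb/psrlb, so the splatted byte rotate shifts whole words and
; masks away the bits that crossed byte boundaries; the mask is produced by
; running the same shift over an all-ones vector (pcmpeqd) and broadcasting
; its low byte.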
define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_rotate_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT:    psubb %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
; SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllw %xmm3, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
; SSE2-NEXT:    psllw %xmm3, %xmm5
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT:    psrlw %xmm2, %xmm0
; SSE2-NEXT:    psrlw %xmm2, %xmm4
; SSE2-NEXT:    psrlw $8, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatvar_rotate_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllw %xmm3, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE41-NEXT:    pcmpeqd %xmm5, %xmm5
; SSE41-NEXT:    psllw %xmm3, %xmm5
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pshufb %xmm3, %xmm5
; SSE41-NEXT:    pand %xmm5, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE41-NEXT:    psubb %xmm1, %xmm3
; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT:    psrlw %xmm1, %xmm0
; SSE41-NEXT:    psrlw %xmm1, %xmm4
; SSE41-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT:    pand %xmm0, %xmm4
; SSE41-NEXT:    por %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: splatvar_rotate_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsllw %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpsllw %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpsllw %xmm2, %xmm0, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpsllw %xmm2, %xmm4, %xmm2
; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw %xmm1, %xmm4, %xmm1
; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatvar_rotate_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT:    vpslld %xmm1, %zmm0, %zmm1
; AVX512F-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT:    vpsrld %xmm2, %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT:    vpslld %xmm1, %zmm0, %zmm1
; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vpsrld %xmm2, %zmm0, %zmm0
; AVX512VL-NEXT:    vpord %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT:    vzeroupper
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v16i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
; AVX512VBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
; AVX512VBMI2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v16i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT:    vpsllw %xmm1, %ymm0, %ymm1
; AVX512VLVBMI2-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT:    vpsrlw %xmm2, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT:    vzeroupper
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v16i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v16i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X86-SSE2-LABEL: splatvar_rotate_v16i8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; X86-SSE2-NEXT:    psubb %xmm1, %xmm2
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0]
; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psllw %xmm3, %xmm1
; X86-SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
; X86-SSE2-NEXT:    psllw %xmm3, %xmm5
; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
; X86-SSE2-NEXT:    pand %xmm3, %xmm1
; X86-SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; X86-SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-SSE2-NEXT:    psrlw %xmm2, %xmm0
; X86-SSE2-NEXT:    psrlw %xmm2, %xmm4
; X86-SSE2-NEXT:    psrlw $8, %xmm4
; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; X86-SSE2-NEXT:    pand %xmm0, %xmm2
; X86-SSE2-NEXT:    por %xmm2, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
  %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <16 x i8> %a, %splat
  %lshr = lshr <16 x i8> %a, %splat8
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Constant Rotates
;

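; With per-element constant amounts, AVX512 and XOP can use vprolv*/vprot*
; directly. Without them, a constant left shift is a multiply by a power of
; two, so the v4i32 and v8i16 cases below are lowered through pmuludq/pmullw,
; and for v8i16 pmulhuw recovers the wrapped-around high bits that form the
; right-shift half of the rotate.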
define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_rotate_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllq $4, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psllq $14, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlq $60, %xmm1
; SSE2-NEXT:    psrlq $50, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    orpd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_rotate_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psllq $14, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllq $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrlq $50, %xmm1
; SSE41-NEXT:    psrlq $60, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    por %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_rotate_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,14]
; AVX512F-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,14]
; AVX512BW-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v2i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: constant_rotate_v2i64:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,14]
; AVX512VBMI2-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v2i64:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: constant_rotate_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: constant_rotate_v2i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psllq $4, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    psllq $14, %xmm2
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrlq $60, %xmm1
; X86-SSE2-NEXT:    psrlq $50, %xmm0
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT:    orpd %xmm2, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <2 x i64> %a, <i64 4, i64 14>
  %lshr = lshr <2 x i64> %a, <i64 60, i64 50>
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_rotate_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_rotate_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    pmuludq %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_rotate_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [16,32,64,128]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
; AVX512F-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
; AVX512BW-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v4i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: constant_rotate_v4i32:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
; AVX512VBMI2-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v4i32:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: constant_rotate_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: constant_rotate_v4i32:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,32,64,128]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    por %xmm3, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25>
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: constant_rotate_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    pmulhuw %xmm1, %xmm2
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: constant_rotate_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX512F-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX512VL-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
; AVX512VL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: constant_rotate_v8i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v8i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: constant_rotate_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: constant_rotate_v8i16:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    pmulhuw %xmm1, %xmm2
; X86-SSE2-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE2-NEXT:    por %xmm2, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
  %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9>
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_rotate_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    packuswb %xmm1, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_rotate_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; SSE41-NEXT:    pmullw %xmm1, %xmm4
; SSE41-NEXT:    pand %xmm3, %xmm4
; SSE41-NEXT:    packuswb %xmm2, %xmm4
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    packuswb %xmm0, %xmm1
; SSE41-NEXT:    por %xmm4, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_rotate_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4
; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT:    vpord %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT:    vzeroupper
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: constant_rotate_v16i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7]
; AVX512VBMI2-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v16i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX512VLVBMI2-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT:    vzeroupper
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: constant_rotate_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: constant_rotate_v16i8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT:    psrlw $8, %xmm2
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT:    psrlw $8, %xmm3
; X86-SSE2-NEXT:    packuswb %xmm2, %xmm3
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT:    pand %xmm2, %xmm1
; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    pand %xmm2, %xmm0
; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
; X86-SSE2-NEXT:    por %xmm3, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
  %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Uniform Constant Rotates
;

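; A splat constant amount fits the immediate forms: psll*/psrl* by immediate
; plus por on SSE/AVX, a single vprol[dq] $n on AVX512 (which only rotates
; i32/i64, so the i16/i8 cases still use shift+or), and vprot[bwdq] $n on
; XOP. v16i8 again needs pand masks around the word shifts, and AVX512VL
; folds those masks and the por together into one vpternlogq.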
define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psllq $14, %xmm1
; SSE-NEXT:    psrlq $50, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllq $14, %xmm0, %xmm1
; AVX-NEXT:    vpsrlq $50, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vprolq $14, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolq $14, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vprolq $14, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v2i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolq $14, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v2i64:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vprolq $14, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v2i64:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vprolq $14, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotq $14, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_v2i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psllq $14, %xmm1
; X86-SSE2-NEXT:    psrlq $50, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <2 x i64> %a, <i64 14, i64 14>
  %lshr = lshr <2 x i64> %a, <i64 50, i64 50>
  %or = or <2 x i64> %shl, %lshr
  ret <2 x i64> %or
}

define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $28, %xmm1
; SSE-NEXT:    pslld $4, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprold $4, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v4i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprold $4, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v4i32:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v4i32:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vprold $4, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_v4i32:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrld $28, %xmm1
; X86-SSE2-NEXT:    pslld $4, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
  %or = or <4 x i32> %shl, %lshr
  ret <4 x i32> %or
}

define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $9, %xmm1
; SSE-NEXT:    psllw $7, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $9, %xmm0, %xmm1
; AVX-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $9, %xmm0, %xmm1
; AVX512F-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $9, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $9, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlw $9, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v8i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v8i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldw $7, %xmm0, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotw $7, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_v8i16:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrlw $9, %xmm1
; X86-SSE2-NEXT:    psllw $7, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <8 x i16> %shl, %lshr
  ret <8 x i16> %or
}

define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $4, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psllw $4, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512VL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v16i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v16i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_v16i8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrlw $4, %xmm1
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    psllw $4, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <16 x i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

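; These check that the masks applied to the two shifted halves are folded
; sensibly: when the rotate is matched (vprol*, vprot*), the two masks
; combine into a single vpand of the rotated value, and when one mask zeroes
; its half entirely the rotate decays to a single shift (the v2i64 SSE case
; below keeps only psrlq+pand, since the shl half is masked to zero).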
2145define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
2146; SSE-LABEL: splatconstant_rotate_mask_v2i64:
2147; SSE:       # %bb.0:
2148; SSE-NEXT:    psrlq $49, %xmm0
2149; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2150; SSE-NEXT:    retq
2151;
2152; AVX-LABEL: splatconstant_rotate_mask_v2i64:
2153; AVX:       # %bb.0:
2154; AVX-NEXT:    vpsrlq $49, %xmm0, %xmm0
2155; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2156; AVX-NEXT:    retq
2157;
2158; AVX512F-LABEL: splatconstant_rotate_mask_v2i64:
2159; AVX512F:       # %bb.0:
2160; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2161; AVX512F-NEXT:    vprolq $15, %zmm0, %zmm0
2162; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2163; AVX512F-NEXT:    vzeroupper
2164; AVX512F-NEXT:    retq
2165;
2166; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64:
2167; AVX512VL:       # %bb.0:
2168; AVX512VL-NEXT:    vprolq $15, %xmm0, %xmm0
2169; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2170; AVX512VL-NEXT:    retq
2171;
2172; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64:
2173; AVX512BW:       # %bb.0:
2174; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2175; AVX512BW-NEXT:    vprolq $15, %zmm0, %zmm0
2176; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2177; AVX512BW-NEXT:    vzeroupper
2178; AVX512BW-NEXT:    retq
2179;
2180; AVX512VLBW-LABEL: splatconstant_rotate_mask_v2i64:
2181; AVX512VLBW:       # %bb.0:
2182; AVX512VLBW-NEXT:    vprolq $15, %xmm0, %xmm0
2183; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2184; AVX512VLBW-NEXT:    retq
2185;
2186; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v2i64:
2187; AVX512VBMI2:       # %bb.0:
2188; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2189; AVX512VBMI2-NEXT:    vprolq $15, %zmm0, %zmm0
2190; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2191; AVX512VBMI2-NEXT:    vzeroupper
2192; AVX512VBMI2-NEXT:    retq
2193;
2194; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v2i64:
2195; AVX512VLVBMI2:       # %bb.0:
2196; AVX512VLVBMI2-NEXT:    vprolq $15, %xmm0, %xmm0
2197; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2198; AVX512VLVBMI2-NEXT:    retq
2199;
2200; XOP-LABEL: splatconstant_rotate_mask_v2i64:
2201; XOP:       # %bb.0:
2202; XOP-NEXT:    vprotq $15, %xmm0, %xmm0
2203; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2204; XOP-NEXT:    retq
2205;
2206; X86-SSE2-LABEL: splatconstant_rotate_mask_v2i64:
2207; X86-SSE2:       # %bb.0:
2208; X86-SSE2-NEXT:    psrlq $49, %xmm0
2209; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2210; X86-SSE2-NEXT:    retl
2211  %shl = shl <2 x i64> %a, <i64 15, i64 15>
2212  %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
2213  %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
2214  %lmask = and <2 x i64> %shl, <i64 65, i64 33>
2215  %or = or <2 x i64> %lmask, %rmask
2216  ret <2 x i64> %or
2217}
2218
2219define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
2220; SSE-LABEL: splatconstant_rotate_mask_v4i32:
2221; SSE:       # %bb.0:
2222; SSE-NEXT:    movdqa %xmm0, %xmm1
2223; SSE-NEXT:    psrld $28, %xmm1
2224; SSE-NEXT:    pslld $4, %xmm0
2225; SSE-NEXT:    por %xmm1, %xmm0
2226; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2227; SSE-NEXT:    retq
2228;
2229; AVX-LABEL: splatconstant_rotate_mask_v4i32:
2230; AVX:       # %bb.0:
2231; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
2232; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
2233; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2234; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2235; AVX-NEXT:    retq
2236;
2237; AVX512F-LABEL: splatconstant_rotate_mask_v4i32:
2238; AVX512F:       # %bb.0:
2239; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2240; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
2241; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2242; AVX512F-NEXT:    vzeroupper
2243; AVX512F-NEXT:    retq
2244;
2245; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32:
2246; AVX512VL:       # %bb.0:
2247; AVX512VL-NEXT:    vprold $4, %xmm0, %xmm0
2248; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2249; AVX512VL-NEXT:    retq
2250;
2251; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32:
2252; AVX512BW:       # %bb.0:
2253; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2254; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
2255; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2256; AVX512BW-NEXT:    vzeroupper
2257; AVX512BW-NEXT:    retq
2258;
2259; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i32:
2260; AVX512VLBW:       # %bb.0:
2261; AVX512VLBW-NEXT:    vprold $4, %xmm0, %xmm0
2262; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2263; AVX512VLBW-NEXT:    retq
2264;
2265; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v4i32:
2266; AVX512VBMI2:       # %bb.0:
2267; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2268; AVX512VBMI2-NEXT:    vprold $4, %zmm0, %zmm0
2269; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2270; AVX512VBMI2-NEXT:    vzeroupper
2271; AVX512VBMI2-NEXT:    retq
2272;
2273; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v4i32:
2274; AVX512VLVBMI2:       # %bb.0:
2275; AVX512VLVBMI2-NEXT:    vprold $4, %xmm0, %xmm0
2276; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2277; AVX512VLVBMI2-NEXT:    retq
2278;
2279; XOP-LABEL: splatconstant_rotate_mask_v4i32:
2280; XOP:       # %bb.0:
2281; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
2282; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2283; XOP-NEXT:    retq
2284;
2285; X86-SSE2-LABEL: splatconstant_rotate_mask_v4i32:
2286; X86-SSE2:       # %bb.0:
2287; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
2288; X86-SSE2-NEXT:    psrld $28, %xmm1
2289; X86-SSE2-NEXT:    pslld $4, %xmm0
2290; X86-SSE2-NEXT:    por %xmm1, %xmm0
2291; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2292; X86-SSE2-NEXT:    retl
2293  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
2294  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
2295  %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
2296  %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
2297  %or = or <4 x i32> %lmask, %rmask
2298  ret <4 x i32> %or
2299}
2300
define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $11, %xmm1
; SSE-NEXT:    psllw $5, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $11, %xmm0, %xmm1
; AVX-NEXT:    vpsllw $5, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $11, %xmm0, %xmm1
; AVX512F-NEXT:    vpsllw $5, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $5, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsrlw $11, %xmm0, %xmm0
; AVX512VL-NEXT:    vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $11, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsllw $5, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $5, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $11, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshldw $5, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldw $5, %xmm0, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotw $5, %xmm0, %xmm0
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v8i16:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrlw $11, %xmm1
; X86-SSE2-NEXT:    psllw $5, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <8 x i16> %lmask, %rmask
  ret <8 x i16> %or
}

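; 8-bit rotate left by 4 (4 + 4 == 8) plus masks. x86 has no vector byte
; shifts, so SSE/AVX use word shifts (psllw/psrlw) and mask away the bits
; that cross byte boundaries before combining; XOP lowers directly to vprotb.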
define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $4, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psllw $4, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v16i8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrlw $4, %xmm1
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    psllw $4, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <16 x i8> %lmask, %rmask
  ret <16 x i8> %or
}

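; Negative test: both shift amounts are 11, and 11 + 11 != 32, so the
; shl/lshr/or is not a rotate and must not be matched as one. The trailing
; and only demands the low 16 bits of each element, which lowers to a plain
; pand or a pblendw against zero.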
define <4 x i32> @rot16_demandedbits(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: rot16_demandedbits:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $11, %xmm1
; SSE2-NEXT:    pslld $11, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: rot16_demandedbits:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $11, %xmm1
; SSE41-NEXT:    pslld $11, %xmm0
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: rot16_demandedbits:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $11, %xmm0, %xmm1
; AVX-NEXT:    vpslld $11, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-NEXT:    retq
;
; AVX512-LABEL: rot16_demandedbits:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $11, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $11, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512-NEXT:    retq
;
; XOP-LABEL: rot16_demandedbits:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $11, %xmm0, %xmm1
; XOP-NEXT:    vpslld $11, %xmm0, %xmm0
; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: rot16_demandedbits:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrld $11, %xmm1
; X86-SSE2-NEXT:    pslld $11, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    retl
  %t0 = lshr <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %t1 = shl <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %t2 = or <4 x i32> %t0, %t1
  %t3 = and <4 x i32> %t2, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %t3
}

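; The shl 5 / lshr 11 pair only forms a rotate in the narrower type after the
; trunc (5 + 11 == 16, not 32), so codegen keeps the 32-bit shifts and
; performs the truncation with a shuffle (pshuflw/pshufhw/pshufd or pshufb).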
define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: rot16_trunc:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $11, %xmm1
; SSE2-NEXT:    pslld $5, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: rot16_trunc:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $11, %xmm1
; SSE41-NEXT:    pslld $5, %xmm0
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT:    retq
;
; AVX-LABEL: rot16_trunc:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $11, %xmm0, %xmm1
; AVX-NEXT:    vpslld $5, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    retq
;
; AVX512-LABEL: rot16_trunc:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $11, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $5, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT:    retq
;
; XOP-LABEL: rot16_trunc:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $11, %xmm0, %xmm1
; XOP-NEXT:    vpslld $5, %xmm0, %xmm0
; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: rot16_trunc:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrld $11, %xmm1
; X86-SSE2-NEXT:    pslld $5, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    retl
  %t0 = lshr <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %t1 = shl <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %t2 = or <4 x i32> %t0, %t1
  %t3 = trunc <4 x i32> %t2 to <4 x i16>
  ret <4 x i16> %t3
}