1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=ALL,SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLBW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLBW
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLVBMI
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLVBMI
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX1
14; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX2
15
16define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
17; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
18; SSE2:       # %bb.0:
19; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
20; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
21; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
22; SSE2-NEXT:    retq
23;
24; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
25; SSSE3:       # %bb.0:
26; SSSE3-NEXT:    pxor %xmm1, %xmm1
27; SSSE3-NEXT:    pshufb %xmm1, %xmm0
28; SSSE3-NEXT:    retq
29;
30; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
31; SSE41:       # %bb.0:
32; SSE41-NEXT:    pxor %xmm1, %xmm1
33; SSE41-NEXT:    pshufb %xmm1, %xmm0
34; SSE41-NEXT:    retq
35;
36; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
37; AVX1:       # %bb.0:
38; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
39; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
40; AVX1-NEXT:    retq
41;
42; AVX2OR512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
43; AVX2OR512VL:       # %bb.0:
44; AVX2OR512VL-NEXT:    vpbroadcastb %xmm0, %xmm0
45; AVX2OR512VL-NEXT:    retq
46;
47; XOPAVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
48; XOPAVX1:       # %bb.0:
49; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
50; XOPAVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
51; XOPAVX1-NEXT:    retq
52;
53; XOPAVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
54; XOPAVX2:       # %bb.0:
55; XOPAVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
56; XOPAVX2-NEXT:    retq
57  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
58  ret <16 x i8> %shuffle
59}
60
61define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
62; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
63; SSE2:       # %bb.0:
64; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
65; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
66; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
67; SSE2-NEXT:    retq
68;
69; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
70; SSSE3:       # %bb.0:
71; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
72; SSSE3-NEXT:    retq
73;
74; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
75; SSE41:       # %bb.0:
76; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
77; SSE41-NEXT:    retq
78;
79; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
80; AVX:       # %bb.0:
81; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
82; AVX-NEXT:    retq
83  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
84  ret <16 x i8> %shuffle
85}
86
87define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
88; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
89; SSE2:       # %bb.0:
90; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
91; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
92; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
93; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
94; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
95; SSE2-NEXT:    retq
96;
97; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
98; SSSE3:       # %bb.0:
99; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
100; SSSE3-NEXT:    retq
101;
102; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
103; SSE41:       # %bb.0:
104; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
105; SSE41-NEXT:    retq
106;
107; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
108; AVX:       # %bb.0:
109; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
110; AVX-NEXT:    retq
111  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
112  ret <16 x i8> %shuffle
113}
114
115define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
116; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
117; SSE:       # %bb.0:
118; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
119; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
120; SSE-NEXT:    retq
121;
122; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
123; AVX1:       # %bb.0:
124; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
125; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
126; AVX1-NEXT:    retq
127;
128; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
129; AVX2-SLOW:       # %bb.0:
130; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
131; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
132; AVX2-SLOW-NEXT:    retq
133;
134; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
135; AVX2-FAST:       # %bb.0:
136; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
137; AVX2-FAST-NEXT:    retq
138;
139; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
140; AVX512VL:       # %bb.0:
141; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
142; AVX512VL-NEXT:    retq
143;
144; XOP-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
145; XOP:       # %bb.0:
146; XOP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
147; XOP-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
148; XOP-NEXT:    retq
149  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
150  ret <16 x i8> %shuffle
151}
152
153define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
154; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
155; SSE:       # %bb.0:
156; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
157; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
158; SSE-NEXT:    retq
159;
160; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
161; AVX1:       # %bb.0:
162; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
163; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
164; AVX1-NEXT:    retq
165;
166; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
167; AVX2-SLOW:       # %bb.0:
168; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
169; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
170; AVX2-SLOW-NEXT:    retq
171;
172; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
173; AVX2-FAST:       # %bb.0:
174; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
175; AVX2-FAST-NEXT:    retq
176;
177; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
178; AVX512VL:       # %bb.0:
179; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
180; AVX512VL-NEXT:    retq
181;
182; XOP-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
183; XOP:       # %bb.0:
184; XOP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
185; XOP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
186; XOP-NEXT:    retq
187  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
188  ret <16 x i8> %shuffle
189}
190
191define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
192; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
193; SSE2:       # %bb.0:
194; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
195; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
196; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
197; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
198; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
199; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
200; SSE2-NEXT:    retq
201;
202; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
203; SSSE3:       # %bb.0:
204; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
205; SSSE3-NEXT:    retq
206;
207; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
208; SSE41:       # %bb.0:
209; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
210; SSE41-NEXT:    retq
211;
212; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
213; AVX:       # %bb.0:
214; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
215; AVX-NEXT:    retq
216  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
217  ret <16 x i8> %shuffle
218}
219
220define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
221; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
222; SSE:       # %bb.0:
223; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
224; SSE-NEXT:    retq
225;
226; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
227; AVX:       # %bb.0:
228; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
229; AVX-NEXT:    retq
230  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
231  ret <16 x i8> %shuffle
232}
233
234define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
235; SSE-LABEL: shuffle_v16i8_0101010101010101:
236; SSE:       # %bb.0:
237; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
238; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
239; SSE-NEXT:    retq
240;
241; AVX1-LABEL: shuffle_v16i8_0101010101010101:
242; AVX1:       # %bb.0:
243; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
244; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
245; AVX1-NEXT:    retq
246;
247; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101:
248; AVX2OR512VL:       # %bb.0:
249; AVX2OR512VL-NEXT:    vpbroadcastw %xmm0, %xmm0
250; AVX2OR512VL-NEXT:    retq
251;
252; XOPAVX1-LABEL: shuffle_v16i8_0101010101010101:
253; XOPAVX1:       # %bb.0:
254; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
255; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
256; XOPAVX1-NEXT:    retq
257;
258; XOPAVX2-LABEL: shuffle_v16i8_0101010101010101:
259; XOPAVX2:       # %bb.0:
260; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
261; XOPAVX2-NEXT:    retq
262  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
263  ret <16 x i8> %shuffle
264}
265
266define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
267; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
268; SSE:       # %bb.0:
269; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
270; SSE-NEXT:    retq
271;
272; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
273; AVX:       # %bb.0:
274; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
275; AVX-NEXT:    retq
276  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
277  ret <16 x i8> %shuffle
278}
279
280define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
281; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
282; SSE:       # %bb.0:
283; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
284; SSE-NEXT:    retq
285;
286; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
287; AVX:       # %bb.0:
288; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
289; AVX-NEXT:    retq
290  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
291  ret <16 x i8> %shuffle
292}
293
294define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
295; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
296; SSE:       # %bb.0:
297; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
298; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
299; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
300; SSE-NEXT:    movdqa %xmm1, %xmm0
301; SSE-NEXT:    retq
302;
303; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
304; AVX1:       # %bb.0:
305; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
306; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
307; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
308; AVX1-NEXT:    retq
309;
310; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
311; AVX2OR512VL:       # %bb.0:
312; AVX2OR512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
313; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
314; AVX2OR512VL-NEXT:    retq
315;
316; XOPAVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
317; XOPAVX1:       # %bb.0:
318; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[0],xmm0[1],xmm1[0],xmm0[2],xmm1[0],xmm0[3],xmm1[0],xmm0[4],xmm1[0],xmm0[5],xmm1[0],xmm0[6],xmm1[0],xmm0[7]
319; XOPAVX1-NEXT:    retq
320;
321; XOPAVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
322; XOPAVX2:       # %bb.0:
323; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
324; XOPAVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
325; XOPAVX2-NEXT:    retq
326  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7>
327  ret <16 x i8> %shuffle
328}
329
330define <16 x i8> @shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<16 x i8> %a, <16 x i8> %b) {
331; SSE2-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
332; SSE2:       # %bb.0:
333; SSE2-NEXT:    pxor %xmm1, %xmm1
334; SSE2-NEXT:    movdqa %xmm0, %xmm2
335; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
336; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
337; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
338; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
339; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
340; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
341; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
342; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
343; SSE2-NEXT:    packuswb %xmm2, %xmm0
344; SSE2-NEXT:    retq
345;
346; SSSE3-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
347; SSSE3:       # %bb.0:
348; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
349; SSSE3-NEXT:    retq
350;
351; SSE41-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
352; SSE41:       # %bb.0:
353; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
354; SSE41-NEXT:    retq
355;
356; AVX-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
357; AVX:       # %bb.0:
358; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
359; AVX-NEXT:    retq
360  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
361  ret <16 x i8> %shuffle
362}
363
364define <16 x i8> @shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08(<16 x i8> %a, <16 x i8> %b) {
365; SSE2-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08:
366; SSE2:       # %bb.0:
367; SSE2-NEXT:    pxor %xmm1, %xmm1
368; SSE2-NEXT:    movdqa %xmm0, %xmm2
369; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
370; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
371; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
372; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
373; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
374; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
375; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
376; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
377; SSE2-NEXT:    packuswb %xmm2, %xmm0
378; SSE2-NEXT:    retq
379;
380; SSSE3-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08:
381; SSSE3:       # %bb.0:
382; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
383; SSSE3-NEXT:    retq
384;
385; SSE41-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08:
386; SSE41:       # %bb.0:
387; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
388; SSE41-NEXT:    retq
389;
390; AVX-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08:
391; AVX:       # %bb.0:
392; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
393; AVX-NEXT:    retq
394  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
395  ret <16 x i8> %shuffle
396}
397
398define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
399; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
400; SSE2:       # %bb.0:
401; SSE2-NEXT:    pxor %xmm1, %xmm1
402; SSE2-NEXT:    movdqa %xmm0, %xmm2
403; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
404; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
405; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
406; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
407; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
408; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
409; SSE2-NEXT:    packuswb %xmm2, %xmm0
410; SSE2-NEXT:    retq
411;
412; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
413; SSSE3:       # %bb.0:
414; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
415; SSSE3-NEXT:    retq
416;
417; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
418; SSE41:       # %bb.0:
419; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
420; SSE41-NEXT:    retq
421;
422; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
423; AVX:       # %bb.0:
424; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
425; AVX-NEXT:    retq
426  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
427  ret <16 x i8> %shuffle
428}
429
430define <16 x i8> @shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14(<16 x i8> %a, <16 x i8> %b) {
431; SSE2-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
432; SSE2:       # %bb.0:
433; SSE2-NEXT:    movdqa %xmm0, %xmm1
434; SSE2-NEXT:    psrlw $8, %xmm1
435; SSE2-NEXT:    psllw $8, %xmm0
436; SSE2-NEXT:    por %xmm1, %xmm0
437; SSE2-NEXT:    retq
438;
439; SSSE3-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
440; SSSE3:       # %bb.0:
441; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
442; SSSE3-NEXT:    retq
443;
444; SSE41-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
445; SSE41:       # %bb.0:
446; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
447; SSE41-NEXT:    retq
448;
449; AVX1-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
450; AVX1:       # %bb.0:
451; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
452; AVX1-NEXT:    retq
453;
454; AVX2OR512VL-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
455; AVX2OR512VL:       # %bb.0:
456; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
457; AVX2OR512VL-NEXT:    retq
458;
459; XOP-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14:
460; XOP:       # %bb.0:
461; XOP-NEXT:    vprotw $8, %xmm0, %xmm0
462; XOP-NEXT:    retq
463  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
464  ret <16 x i8> %shuffle
465}
466
467define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
468; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
469; SSE2:       # %bb.0:
470; SSE2-NEXT:    pxor %xmm2, %xmm2
471; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
472; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
473; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
474; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
475; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
476; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
477; SSE2-NEXT:    packuswb %xmm1, %xmm0
478; SSE2-NEXT:    retq
479;
480; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
481; SSSE3:       # %bb.0:
482; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
483; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
484; SSSE3-NEXT:    retq
485;
486; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
487; SSE41:       # %bb.0:
488; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
489; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
490; SSE41-NEXT:    retq
491;
492; AVX1-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
493; AVX1:       # %bb.0:
494; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
495; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
496; AVX1-NEXT:    retq
497;
498; AVX2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
499; AVX2:       # %bb.0:
500; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
501; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
502; AVX2-NEXT:    retq
503;
504; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
505; AVX512VLBW:       # %bb.0:
506; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
507; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
508; AVX512VLBW-NEXT:    retq
509;
510; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
511; AVX512VLVBMI:       # %bb.0:
512; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,19,18,17,16,23,22,21,20]
513; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
514; AVX512VLVBMI-NEXT:    retq
515;
516; XOP-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
517; XOP:       # %bb.0:
518; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],xmm1[3,2,1,0,7,6,5,4]
519; XOP-NEXT:    retq
520  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20>
521  ret <16 x i8> %shuffle
522}
523
524define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
525; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
526; SSE2:       # %bb.0:
527; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
528; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
529; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
530; SSE2-NEXT:    pxor %xmm1, %xmm1
531; SSE2-NEXT:    movdqa %xmm0, %xmm2
532; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
533; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7]
534; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
535; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
536; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
537; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm0[3,2,1,0,4,5,6,7]
538; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,6,5,4]
539; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
540; SSE2-NEXT:    packuswb %xmm1, %xmm0
541; SSE2-NEXT:    retq
542;
543; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
544; SSSE3:       # %bb.0:
545; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
546; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
547; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
548; SSSE3-NEXT:    retq
549;
550; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
551; SSE41:       # %bb.0:
552; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
553; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
554; SSE41-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
555; SSE41-NEXT:    retq
556;
557; AVX1-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
558; AVX1:       # %bb.0:
559; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
560; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
561; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
562; AVX1-NEXT:    retq
563;
564; AVX2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
565; AVX2:       # %bb.0:
566; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
567; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
568; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
569; AVX2-NEXT:    retq
570;
571; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
572; AVX512VLBW:       # %bb.0:
573; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
574; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
575; AVX512VLBW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
576; AVX512VLBW-NEXT:    retq
577;
578; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
579; AVX512VLVBMI:       # %bb.0:
580; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,31,30,29,28,11,10,9,8,23,22,21,20]
581; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
582; AVX512VLVBMI-NEXT:    retq
583;
584; XOP-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
585; XOP:       # %bb.0:
586; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[3,2,1,0],xmm1[15,14,13,12],xmm0[11,10,9,8],xmm1[7,6,5,4]
587; XOP-NEXT:    retq
588  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20>
589  ret <16 x i8> %shuffle
590}
591
592define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) {
593; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
594; SSE2:       # %bb.0:
595; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
596; SSE2-NEXT:    andps %xmm2, %xmm0
597; SSE2-NEXT:    andnps %xmm1, %xmm2
598; SSE2-NEXT:    orps %xmm2, %xmm0
599; SSE2-NEXT:    retq
600;
601; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
602; SSSE3:       # %bb.0:
603; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
604; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
605; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
606; SSSE3-NEXT:    retq
607;
608; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
609; SSE41:       # %bb.0:
610; SSE41-NEXT:    movdqa %xmm0, %xmm2
611; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
612; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
613; SSE41-NEXT:    movdqa %xmm1, %xmm0
614; SSE41-NEXT:    retq
615;
616; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
617; AVX1OR2:       # %bb.0:
618; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
619; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
620; AVX1OR2-NEXT:    retq
621;
622; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
623; AVX512VL:       # %bb.0:
624; AVX512VL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
625; AVX512VL-NEXT:    kmovd %eax, %k1
626; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
627; AVX512VL-NEXT:    retq
628  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
629  ret <16 x i8> %shuffle
630}
631
632define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) {
633; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
634; SSE2:       # %bb.0:
635; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
636; SSE2-NEXT:    andps %xmm2, %xmm0
637; SSE2-NEXT:    andnps %xmm1, %xmm2
638; SSE2-NEXT:    orps %xmm2, %xmm0
639; SSE2-NEXT:    retq
640;
641; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
642; SSSE3:       # %bb.0:
643; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15]
644; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero
645; SSSE3-NEXT:    por %xmm1, %xmm0
646; SSSE3-NEXT:    retq
647;
648; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
649; SSE41:       # %bb.0:
650; SSE41-NEXT:    movdqa %xmm0, %xmm2
651; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
652; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
653; SSE41-NEXT:    movdqa %xmm1, %xmm0
654; SSE41-NEXT:    retq
655;
656; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
657; AVX1OR2:       # %bb.0:
658; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
659; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
660; AVX1OR2-NEXT:    retq
661;
662; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
663; AVX512VL:       # %bb.0:
664; AVX512VL-NEXT:    movw $-30584, %ax # imm = 0x8888
665; AVX512VL-NEXT:    kmovd %eax, %k1
666; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
667; AVX512VL-NEXT:    retq
668  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
669  ret <16 x i8> %shuffle
670}
671
672define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) {
673; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
674; SSE:       # %bb.0:
675; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
676; SSE-NEXT:    retq
677;
678; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
679; AVX:       # %bb.0:
680; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
681; AVX-NEXT:    retq
682  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
683  ret <16 x i8> %shuffle
684}
685
686define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) {
687; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
688; SSE2:       # %bb.0:
689; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
690; SSE2-NEXT:    andps %xmm2, %xmm0
691; SSE2-NEXT:    andnps %xmm1, %xmm2
692; SSE2-NEXT:    orps %xmm2, %xmm0
693; SSE2-NEXT:    retq
694;
695; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
696; SSSE3:       # %bb.0:
697; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
698; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
699; SSSE3-NEXT:    por %xmm1, %xmm0
700; SSSE3-NEXT:    retq
701;
702; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
703; SSE41:       # %bb.0:
704; SSE41-NEXT:    movdqa %xmm0, %xmm2
705; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
706; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
707; SSE41-NEXT:    movdqa %xmm1, %xmm0
708; SSE41-NEXT:    retq
709;
710; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
711; AVX1OR2:       # %bb.0:
712; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
713; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
714; AVX1OR2-NEXT:    retq
715;
716; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
717; AVX512VL:       # %bb.0:
718; AVX512VL-NEXT:    movw $-28528, %ax # imm = 0x9090
719; AVX512VL-NEXT:    kmovd %eax, %k1
720; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
721; AVX512VL-NEXT:    retq
722  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
723  ret <16 x i8> %shuffle
724}
725
726define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) {
727; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
728; SSE2:       # %bb.0:
729; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
730; SSE2-NEXT:    andps %xmm2, %xmm1
731; SSE2-NEXT:    andnps %xmm0, %xmm2
732; SSE2-NEXT:    orps %xmm1, %xmm2
733; SSE2-NEXT:    movaps %xmm2, %xmm0
734; SSE2-NEXT:    retq
735;
736; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
737; SSSE3:       # %bb.0:
738; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15]
739; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero
740; SSSE3-NEXT:    por %xmm1, %xmm0
741; SSSE3-NEXT:    retq
742;
743; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
744; SSE41:       # %bb.0:
745; SSE41-NEXT:    movdqa %xmm0, %xmm2
746; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
747; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
748; SSE41-NEXT:    movdqa %xmm2, %xmm0
749; SSE41-NEXT:    retq
750;
751; AVX1OR2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
752; AVX1OR2:       # %bb.0:
753; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
754; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
755; AVX1OR2-NEXT:    retq
756;
757; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
758; AVX512VL:       # %bb.0:
759; AVX512VL-NEXT:    movw $-21264, %ax # imm = 0xACF0
760; AVX512VL-NEXT:    kmovd %eax, %k1
761; AVX512VL-NEXT:    vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
762; AVX512VL-NEXT:    retq
763  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15>
764  ret <16 x i8> %shuffle
765}
766
767define <16 x i8> @shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a, <16 x i8> %b)  {
768; SSE2-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
769; SSE2:       # %bb.0:
770; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
771; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
772; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
773; SSE2-NEXT:    psrlq $16, %xmm0
774; SSE2-NEXT:    packuswb %xmm0, %xmm0
775; SSE2-NEXT:    retq
776;
777; SSSE3-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
778; SSSE3:       # %bb.0:
779; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
780; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
781; SSSE3-NEXT:    retq
782;
783; SSE41-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
784; SSE41:       # %bb.0:
785; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
786; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
787; SSE41-NEXT:    retq
788;
789; AVX1-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
790; AVX1:       # %bb.0:
791; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
792; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
793; AVX1-NEXT:    retq
794;
795; AVX2-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
796; AVX2:       # %bb.0:
797; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
798; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
799; AVX2-NEXT:    retq
800;
801; AVX512VLBW-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
802; AVX512VLBW:       # %bb.0:
803; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
804; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
805; AVX512VLBW-NEXT:    retq
806;
807; AVX512VLVBMI-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
808; AVX512VLVBMI:       # %bb.0:
809; AVX512VLVBMI-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5122,5122,5122,5122,5122,5122,5122,5122]
810; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
811; AVX512VLVBMI-NEXT:    retq
812;
813; XOP-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
814; XOP:       # %bb.0:
815; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[2],xmm1[4],xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
816; XOP-NEXT:    retq
817  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 20, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
818  ret <16 x i8> %shuffle
819}
820
821; PR39387
822define <16 x i8> @shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4(<16 x i8> %a, <16 x i8> %b) {
823; SSE2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
824; SSE2:       # %bb.0:
825; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,255]
826; SSE2-NEXT:    movdqa %xmm0, %xmm3
827; SSE2-NEXT:    pand %xmm2, %xmm3
828; SSE2-NEXT:    pandn %xmm1, %xmm2
829; SSE2-NEXT:    por %xmm3, %xmm2
830; SSE2-NEXT:    pxor %xmm1, %xmm1
831; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
832; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
833; SSE2-NEXT:    movdqa %xmm0, %xmm1
834; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,65535]
835; SSE2-NEXT:    pand %xmm3, %xmm0
836; SSE2-NEXT:    pandn %xmm2, %xmm3
837; SSE2-NEXT:    pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
838; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
839; SSE2-NEXT:    por %xmm2, %xmm1
840; SSE2-NEXT:    por %xmm0, %xmm3
841; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,0]
842; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,5,7]
843; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
844; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,3,4,5,6,7]
845; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
846; SSE2-NEXT:    packuswb %xmm0, %xmm1
847; SSE2-NEXT:    movdqa %xmm1, %xmm0
848; SSE2-NEXT:    retq
849;
850; SSSE3-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
851; SSSE3:       # %bb.0:
852; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
853; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
854; SSSE3-NEXT:    retq
855;
856; SSE41-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
857; SSE41:       # %bb.0:
858; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
859; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
860; SSE41-NEXT:    retq
861;
862; AVX1-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
863; AVX1:       # %bb.0:
864; AVX1-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
865; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
866; AVX1-NEXT:    retq
867;
868; AVX2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
869; AVX2:       # %bb.0:
870; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
871; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
872; AVX2-NEXT:    retq
873;
874; AVX512VLBW-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
875; AVX512VLBW:       # %bb.0:
876; AVX512VLBW-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
877; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9]
878; AVX512VLBW-NEXT:    retq
879;
880; AVX512VLVBMI-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
881; AVX512VLVBMI:       # %bb.0:
882; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,6,7,8,9,10,27,28,29,30,30,1,1,2,3,4]
883; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
884; AVX512VLVBMI-NEXT:    retq
885;
886; XOP-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4:
887; XOP:       # %bb.0:
888; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],xmm1[11,12,13,14,14],xmm0[1,1,2,3,4]
889; XOP-NEXT:    retq
890  %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 30, i32 1, i32 1, i32 2, i32 3, i32 4>
891  ret <16 x i8> %1
892}
893
894; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780
895
896define <16 x i8> @load_fold_pblendvb(<16 x i8>* %px, <16 x i8> %y) {
897; SSE2-LABEL: load_fold_pblendvb:
898; SSE2:       # %bb.0:
899; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
900; SSE2-NEXT:    andps %xmm1, %xmm0
901; SSE2-NEXT:    andnps (%rdi), %xmm1
902; SSE2-NEXT:    orps %xmm1, %xmm0
903; SSE2-NEXT:    retq
904;
905; SSSE3-LABEL: load_fold_pblendvb:
906; SSSE3:       # %bb.0:
907; SSSE3-NEXT:    movdqa (%rdi), %xmm1
908; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3],zero,zero,zero,xmm0[7,8,9],zero,xmm0[11],zero,zero,zero,xmm0[15]
909; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,xmm1[4,5,6],zero,zero,zero,xmm1[10],zero,xmm1[12,13,14],zero
910; SSSE3-NEXT:    por %xmm1, %xmm0
911; SSSE3-NEXT:    retq
912;
913; SSE41-LABEL: load_fold_pblendvb:
914; SSE41:       # %bb.0:
915; SSE41-NEXT:    movdqa %xmm0, %xmm1
916; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
917; SSE41-NEXT:    pblendvb %xmm0, (%rdi), %xmm1
918; SSE41-NEXT:    movdqa %xmm1, %xmm0
919; SSE41-NEXT:    retq
920;
921; AVX1OR2-LABEL: load_fold_pblendvb:
922; AVX1OR2:       # %bb.0:
923; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
924; AVX1OR2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
925; AVX1OR2-NEXT:    retq
926;
927; AVX512VL-LABEL: load_fold_pblendvb:
928; AVX512VL:       # %bb.0:
929; AVX512VL-NEXT:    movw $29812, %ax # imm = 0x7474
930; AVX512VL-NEXT:    kmovd %eax, %k1
931; AVX512VL-NEXT:    vmovdqu8 (%rdi), %xmm0 {%k1}
932; AVX512VL-NEXT:    retq
933  %x = load <16 x i8>, <16 x i8>* %px, align 16
934  %select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
935  ret <16 x i8> %select
936}
937
938define <16 x i8> @load_fold_pblendvb_commute(<16 x i8>* %px, <16 x i8> %y) {
939; SSE2-LABEL: load_fold_pblendvb_commute:
940; SSE2:       # %bb.0:
941; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
942; SSE2-NEXT:    movaps %xmm1, %xmm2
943; SSE2-NEXT:    andnps %xmm0, %xmm2
944; SSE2-NEXT:    andps (%rdi), %xmm1
945; SSE2-NEXT:    orps %xmm2, %xmm1
946; SSE2-NEXT:    movaps %xmm1, %xmm0
947; SSE2-NEXT:    retq
948;
949; SSSE3-LABEL: load_fold_pblendvb_commute:
950; SSSE3:       # %bb.0:
951; SSSE3-NEXT:    movdqa (%rdi), %xmm1
952; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[2],zero,xmm0[4,5,6],zero,zero,zero,xmm0[10],zero,xmm0[12,13,14],zero
953; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3],zero,zero,zero,xmm1[7,8,9],zero,xmm1[11],zero,zero,zero,xmm1[15]
954; SSSE3-NEXT:    por %xmm1, %xmm0
955; SSSE3-NEXT:    retq
956;
957; SSE41-LABEL: load_fold_pblendvb_commute:
958; SSE41:       # %bb.0:
959; SSE41-NEXT:    movdqa %xmm0, %xmm1
960; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
961; SSE41-NEXT:    pblendvb %xmm0, (%rdi), %xmm1
962; SSE41-NEXT:    movdqa %xmm1, %xmm0
963; SSE41-NEXT:    retq
964;
965; AVX1OR2-LABEL: load_fold_pblendvb_commute:
966; AVX1OR2:       # %bb.0:
967; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
968; AVX1OR2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
969; AVX1OR2-NEXT:    retq
970;
971; AVX512VL-LABEL: load_fold_pblendvb_commute:
972; AVX512VL:       # %bb.0:
973; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
974; AVX512VL-NEXT:    movw $29812, %ax # imm = 0x7474
975; AVX512VL-NEXT:    kmovd %eax, %k1
976; AVX512VL-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
977; AVX512VL-NEXT:    vmovdqa %xmm1, %xmm0
978; AVX512VL-NEXT:    retq
979  %x = load <16 x i8>, <16 x i8>* %px, align 16
980  %select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
981  ret <16 x i8> %select
982}
983
984define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
985; SSE2-LABEL: trunc_v4i32_shuffle:
986; SSE2:       # %bb.0:
987; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
988; SSE2-NEXT:    packuswb %xmm0, %xmm0
989; SSE2-NEXT:    packuswb %xmm0, %xmm0
990; SSE2-NEXT:    retq
991;
992; SSSE3-LABEL: trunc_v4i32_shuffle:
993; SSSE3:       # %bb.0:
994; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
995; SSSE3-NEXT:    retq
996;
997; SSE41-LABEL: trunc_v4i32_shuffle:
998; SSE41:       # %bb.0:
999; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1000; SSE41-NEXT:    retq
1001;
1002; AVX1OR2-LABEL: trunc_v4i32_shuffle:
1003; AVX1OR2:       # %bb.0:
1004; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
1005; AVX1OR2-NEXT:    retq
1006;
1007; AVX512VL-LABEL: trunc_v4i32_shuffle:
1008; AVX512VL:       # %bb.0:
1009; AVX512VL-NEXT:    vpmovdb %xmm0, %xmm0
1010; AVX512VL-NEXT:    retq
1011  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1012  ret <16 x i8> %shuffle
1013}
1014
1015define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
1016; We don't have anything useful to check here. This generates 100s of
1017; instructions. Instead, just make sure we survived codegen.
1018; ALL-LABEL: stress_test0:
1019; ALL:         retq
1020entry:
1021  %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
1022  %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
1023  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
1024  %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
1025  %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
1026  %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
1027  %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
1028  %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
1029  %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
1030  %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
1031  ret <16 x i8> %s.16.0
1032}
1033
1034define <16 x i8> @undef_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
1035; There is nothing interesting to check about these instructions other than
1036; that they survive codegen. However, we actually do better and delete all of
1037; them because the result is 'undef'.
1038;
1039; ALL-LABEL: undef_test1:
1040; ALL:       # %bb.0: # %entry
1041; ALL-NEXT:    retq
1042entry:
1043  %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
1044  %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
1045  %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
1046  %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
1047  %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
1048  %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
1049  %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
1050  %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
1051  %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1052  %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1053  %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
1054  %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1055  %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef>
1056
1057  ret <16 x i8> %s.12.4
1058}
1059
1060define <16 x i8> @PR20540(<8 x i8> %a) {
1061; SSE-LABEL: PR20540:
1062; SSE:       # %bb.0:
1063; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
1064; SSE-NEXT:    retq
1065;
1066; AVX-LABEL: PR20540:
1067; AVX:       # %bb.0:
1068; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
1069; AVX-NEXT:    retq
1070  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
1071  ret <16 x i8> %shuffle
1072}
1073
1074define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
1075; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1076; SSE:       # %bb.0:
1077; SSE-NEXT:    movzbl %dil, %eax
1078; SSE-NEXT:    movd %eax, %xmm0
1079; SSE-NEXT:    retq
1080;
1081; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1082; AVX:       # %bb.0:
1083; AVX-NEXT:    movzbl %dil, %eax
1084; AVX-NEXT:    vmovd %eax, %xmm0
1085; AVX-NEXT:    retq
1086  %a = insertelement <16 x i8> undef, i8 %i, i32 0
1087  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1088  ret <16 x i8> %shuffle
1089}
1090
1091define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
1092; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1093; SSE2:       # %bb.0:
1094; SSE2-NEXT:    shll $8, %edi
1095; SSE2-NEXT:    pxor %xmm0, %xmm0
1096; SSE2-NEXT:    pinsrw $2, %edi, %xmm0
1097; SSE2-NEXT:    retq
1098;
1099; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1100; SSSE3:       # %bb.0:
1101; SSSE3-NEXT:    shll $8, %edi
1102; SSSE3-NEXT:    pxor %xmm0, %xmm0
1103; SSSE3-NEXT:    pinsrw $2, %edi, %xmm0
1104; SSSE3-NEXT:    retq
1105;
1106; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1107; SSE41:       # %bb.0:
1108; SSE41-NEXT:    pxor %xmm0, %xmm0
1109; SSE41-NEXT:    pinsrb $5, %edi, %xmm0
1110; SSE41-NEXT:    retq
1111;
1112; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1113; AVX:       # %bb.0:
1114; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1115; AVX-NEXT:    vpinsrb $5, %edi, %xmm0, %xmm0
1116; AVX-NEXT:    retq
1117  %a = insertelement <16 x i8> undef, i8 %i, i32 0
1118  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
1119  ret <16 x i8> %shuffle
1120}
1121
1122define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
1123; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
1124; SSE2:       # %bb.0:
1125; SSE2-NEXT:    shll $8, %edi
1126; SSE2-NEXT:    pxor %xmm0, %xmm0
1127; SSE2-NEXT:    pinsrw $7, %edi, %xmm0
1128; SSE2-NEXT:    retq
1129;
1130; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
1131; SSSE3:       # %bb.0:
1132; SSSE3-NEXT:    shll $8, %edi
1133; SSSE3-NEXT:    pxor %xmm0, %xmm0
1134; SSSE3-NEXT:    pinsrw $7, %edi, %xmm0
1135; SSSE3-NEXT:    retq
1136;
1137; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
1138; SSE41:       # %bb.0:
1139; SSE41-NEXT:    pxor %xmm0, %xmm0
1140; SSE41-NEXT:    pinsrb $15, %edi, %xmm0
1141; SSE41-NEXT:    retq
1142;
1143; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
1144; AVX:       # %bb.0:
1145; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1146; AVX-NEXT:    vpinsrb $15, %edi, %xmm0, %xmm0
1147; AVX-NEXT:    retq
1148  %a = insertelement <16 x i8> undef, i8 %i, i32 0
1149  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16>
1150  ret <16 x i8> %shuffle
1151}
1152
1153define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
1154; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1155; SSE2:       # %bb.0:
1156; SSE2-NEXT:    movzbl %dil, %eax
1157; SSE2-NEXT:    pxor %xmm0, %xmm0
1158; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
1159; SSE2-NEXT:    retq
1160;
1161; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1162; SSSE3:       # %bb.0:
1163; SSSE3-NEXT:    movzbl %dil, %eax
1164; SSSE3-NEXT:    pxor %xmm0, %xmm0
1165; SSSE3-NEXT:    pinsrw $1, %eax, %xmm0
1166; SSSE3-NEXT:    retq
1167;
1168; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1169; SSE41:       # %bb.0:
1170; SSE41-NEXT:    pxor %xmm0, %xmm0
1171; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
1172; SSE41-NEXT:    retq
1173;
1174; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1175; AVX:       # %bb.0:
1176; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1177; AVX-NEXT:    vpinsrb $2, %edi, %xmm0, %xmm0
1178; AVX-NEXT:    retq
1179  %a = insertelement <16 x i8> undef, i8 %i, i32 3
1180  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1181  ret <16 x i8> %shuffle
1182}
1183
1184define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
1185; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
1186; SSE:       # %bb.0:
1187; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1188; SSE-NEXT:    retq
1189;
1190; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
1191; AVX:       # %bb.0:
1192; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
1193; AVX-NEXT:    retq
1194  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
1195  ret <16 x i8> %shuffle
1196}
1197
1198define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
1199; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1200; SSE:       # %bb.0:
1201; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1202; SSE-NEXT:    retq
1203;
1204; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1205; AVX:       # %bb.0:
1206; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1207; AVX-NEXT:    retq
1208  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0>
1209  ret <16 x i8> %shuffle
1210}
1211
1212define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
1213; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1214; SSE2:       # %bb.0:
1215; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1216; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1217; SSE2-NEXT:    por %xmm1, %xmm0
1218; SSE2-NEXT:    retq
1219;
1220; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1221; SSSE3:       # %bb.0:
1222; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1223; SSSE3-NEXT:    retq
1224;
1225; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1226; SSE41:       # %bb.0:
1227; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1228; SSE41-NEXT:    retq
1229;
1230; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1231; AVX:       # %bb.0:
1232; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1233; AVX-NEXT:    retq
1234  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
1235  ret <16 x i8> %shuffle
1236}
1237
1238define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
1239; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1240; SSE2:       # %bb.0:
1241; SSE2-NEXT:    movdqa %xmm0, %xmm1
1242; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1243; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1244; SSE2-NEXT:    por %xmm1, %xmm0
1245; SSE2-NEXT:    retq
1246;
1247; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1248; SSSE3:       # %bb.0:
1249; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1250; SSSE3-NEXT:    retq
1251;
1252; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1253; SSE41:       # %bb.0:
1254; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1255; SSE41-NEXT:    retq
1256;
1257; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
1258; AVX:       # %bb.0:
1259; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1260; AVX-NEXT:    retq
1261  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
1262  ret <16 x i8> %shuffle
1263}
1264
1265define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) {
1266; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
1267; SSE2:       # %bb.0:
1268; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
1269; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
1270; SSE2-NEXT:    por %xmm1, %xmm0
1271; SSE2-NEXT:    retq
1272;
1273; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
1274; SSSE3:       # %bb.0:
1275; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1276; SSSE3-NEXT:    retq
1277;
1278; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
1279; SSE41:       # %bb.0:
1280; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1281; SSE41-NEXT:    retq
1282;
1283; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
1284; AVX:       # %bb.0:
1285; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
1286; AVX-NEXT:    retq
1287  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
1288  ret <16 x i8> %shuffle
1289}
1290
1291define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) {
1292; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
1293; SSE2:       # %bb.0:
1294; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
1295; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1296; SSE2-NEXT:    por %xmm1, %xmm0
1297; SSE2-NEXT:    retq
1298;
1299; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
1300; SSSE3:       # %bb.0:
1301; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
1302; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1303; SSSE3-NEXT:    retq
1304;
1305; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
1306; SSE41:       # %bb.0:
1307; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
1308; SSE41-NEXT:    movdqa %xmm1, %xmm0
1309; SSE41-NEXT:    retq
1310;
1311; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
1312; AVX:       # %bb.0:
1313; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
1314; AVX-NEXT:    retq
1315  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
1316  ret <16 x i8> %shuffle
1317}
1318
1319define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) {
1320; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
1321; SSE2:       # %bb.0:
1322; SSE2-NEXT:    movdqa %xmm0, %xmm1
1323; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
1324; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
1325; SSE2-NEXT:    por %xmm1, %xmm0
1326; SSE2-NEXT:    retq
1327;
1328; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
1329; SSSE3:       # %bb.0:
1330; SSSE3-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
1331; SSSE3-NEXT:    retq
1332;
1333; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
1334; SSE41:       # %bb.0:
1335; SSE41-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
1336; SSE41-NEXT:    retq
1337;
1338; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
1339; AVX:       # %bb.0:
1340; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
1341; AVX-NEXT:    retq
1342  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
1343  ret <16 x i8> %shuffle
1344}
1345
1346define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) {
1347; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
1348; SSE2:       # %bb.0:
1349; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1350; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1351; SSE2-NEXT:    por %xmm1, %xmm0
1352; SSE2-NEXT:    retq
1353;
1354; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
1355; SSSE3:       # %bb.0:
1356; SSSE3-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1357; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1358; SSSE3-NEXT:    retq
1359;
1360; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
1361; SSE41:       # %bb.0:
1362; SSE41-NEXT:    palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1363; SSE41-NEXT:    movdqa %xmm1, %xmm0
1364; SSE41-NEXT:    retq
1365;
1366; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
1367; AVX:       # %bb.0:
1368; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
1369; AVX-NEXT:    retq
1370  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
1371  ret <16 x i8> %shuffle
1372}
1373
1374; PR31151
1375define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(<16 x i8> %val1, <16 x i8> %val2) {
1376; SSE-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1377; SSE:       # %bb.0:
1378; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1379; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1380; SSE-NEXT:    retq
1381;
1382; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
1383; AVX:       # %bb.0:
1384; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1385; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1386; AVX-NEXT:    retq
1387  %shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
1388  ret <16 x i8> %shuffle
1389}
1390
1391define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
1392; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1393; SSE2:       # %bb.0:
1394; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1395; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1396; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
1397; SSE2-NEXT:    retq
1398;
1399; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1400; SSSE3:       # %bb.0:
1401; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,u,u,u,u,u,u,u,1,u,u,u,u,u,u,u]
1402; SSSE3-NEXT:    retq
1403;
1404; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1405; SSE41:       # %bb.0:
1406; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1407; SSE41-NEXT:    retq
1408;
1409; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
1410; AVX:       # %bb.0:
1411; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1412; AVX-NEXT:    retq
1413  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1414  ret <16 x i8> %shuffle
1415}
1416
1417define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
1418; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1419; SSE2:       # %bb.0:
1420; SSE2-NEXT:    pxor %xmm1, %xmm1
1421; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1422; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1423; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1424; SSE2-NEXT:    retq
1425;
1426; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1427; SSSE3:       # %bb.0:
1428; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1429; SSSE3-NEXT:    retq
1430;
1431; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1432; SSE41:       # %bb.0:
1433; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1434; SSE41-NEXT:    retq
1435;
1436; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
1437; AVX:       # %bb.0:
1438; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
1439; AVX-NEXT:    retq
1440  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1441  ret <16 x i8> %shuffle
1442}
1443
1444define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
1445; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1446; SSE2:       # %bb.0:
1447; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1448; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1449; SSE2-NEXT:    retq
1450;
1451; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1452; SSSE3:       # %bb.0:
1453; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1454; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1455; SSSE3-NEXT:    retq
1456;
1457; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1458; SSE41:       # %bb.0:
1459; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1460; SSE41-NEXT:    retq
1461;
1462; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
1463; AVX:       # %bb.0:
1464; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1465; AVX-NEXT:    retq
1466  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
1467  ret <16 x i8> %shuffle
1468}
1469
1470define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
1471; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1472; SSE2:       # %bb.0:
1473; SSE2-NEXT:    pxor %xmm1, %xmm1
1474; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1475; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1476; SSE2-NEXT:    retq
1477;
1478; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1479; SSSE3:       # %bb.0:
1480; SSSE3-NEXT:    pxor %xmm1, %xmm1
1481; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1482; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1483; SSSE3-NEXT:    retq
1484;
1485; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1486; SSE41:       # %bb.0:
1487; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1488; SSE41-NEXT:    retq
1489;
1490; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
1491; AVX:       # %bb.0:
1492; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1493; AVX-NEXT:    retq
1494  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
1495  ret <16 x i8> %shuffle
1496}
1497
1498define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
1499; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1500; SSE2:       # %bb.0:
1501; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1502; SSE2-NEXT:    retq
1503;
1504; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1505; SSSE3:       # %bb.0:
1506; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1507; SSSE3-NEXT:    retq
1508;
1509; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1510; SSE41:       # %bb.0:
1511; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1512; SSE41-NEXT:    retq
1513;
1514; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
1515; AVX:       # %bb.0:
1516; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1517; AVX-NEXT:    retq
1518  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
1519  ret <16 x i8> %shuffle
1520}
1521
1522define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
1523; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1524; SSE2:       # %bb.0:
1525; SSE2-NEXT:    pxor %xmm1, %xmm1
1526; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1527; SSE2-NEXT:    retq
1528;
1529; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1530; SSSE3:       # %bb.0:
1531; SSSE3-NEXT:    pxor %xmm1, %xmm1
1532; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1533; SSSE3-NEXT:    retq
1534;
1535; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1536; SSE41:       # %bb.0:
1537; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1538; SSE41-NEXT:    retq
1539;
1540; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
1541; AVX:       # %bb.0:
1542; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1543; AVX-NEXT:    retq
1544  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
1545  ret <16 x i8> %shuffle
1546}
1547
1548define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
1549; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1550; SSE2:       # %bb.0: # %entry
1551; SSE2-NEXT:    pxor %xmm2, %xmm2
1552; SSE2-NEXT:    movdqa %xmm0, %xmm3
1553; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1554; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm3[1,3,2,0,4,5,6,7]
1555; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1]
1556; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535]
1557; SSE2-NEXT:    pand %xmm5, %xmm4
1558; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1559; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,3,0,1]
1560; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
1561; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
1562; SSE2-NEXT:    pandn %xmm2, %xmm5
1563; SSE2-NEXT:    por %xmm4, %xmm5
1564; SSE2-NEXT:    psrlq $16, %xmm0
1565; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
1566; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,1,3]
1567; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1568; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4]
1569; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
1570; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1571; SSE2-NEXT:    packuswb %xmm5, %xmm2
1572; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
1573; SSE2-NEXT:    pand %xmm0, %xmm2
1574; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
1575; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1576; SSE2-NEXT:    pandn %xmm1, %xmm0
1577; SSE2-NEXT:    por %xmm2, %xmm0
1578; SSE2-NEXT:    retq
1579;
1580; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1581; SSSE3:       # %bb.0: # %entry
1582; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1583; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1584; SSSE3-NEXT:    por %xmm1, %xmm0
1585; SSSE3-NEXT:    retq
1586;
1587; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1588; SSE41:       # %bb.0: # %entry
1589; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1590; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1591; SSE41-NEXT:    por %xmm1, %xmm0
1592; SSE41-NEXT:    retq
1593;
1594; AVX1-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1595; AVX1:       # %bb.0: # %entry
1596; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1597; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1598; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1599; AVX1-NEXT:    retq
1600;
1601; AVX2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1602; AVX2:       # %bb.0: # %entry
1603; AVX2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1604; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1605; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1606; AVX2-NEXT:    retq
1607;
1608; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1609; AVX512VLBW:       # %bb.0: # %entry
1610; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
1611; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
1612; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1613; AVX512VLBW-NEXT:    retq
1614;
1615; AVX512VLVBMI-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1616; AVX512VLVBMI:       # %bb.0: # %entry
1617; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0>
1618; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
1619; AVX512VLVBMI-NEXT:    retq
1620;
1621; XOP-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
1622; XOP:       # %bb.0: # %entry
1623; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[u,10,2,7],xmm1[6],xmm0[14,7,2],xmm1[2],xmm0[3,1,14],xmm1[2],xmm0[9,11,0]
1624; XOP-NEXT:    retq
1625entry:
1626  %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
1627
1628  ret <16 x i8> %shuffle
1629}
1630
1631define <16 x i8> @shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(<8 x i16> %a0, <8 x i16> %a1) {
1632; SSE-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1633; SSE:       # %bb.0:
1634; SSE-NEXT:    psrlw $8, %xmm0
1635; SSE-NEXT:    psrlw $8, %xmm1
1636; SSE-NEXT:    packuswb %xmm1, %xmm0
1637; SSE-NEXT:    retq
1638;
1639; AVX1-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1640; AVX1:       # %bb.0:
1641; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1642; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
1643; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1644; AVX1-NEXT:    retq
1645;
1646; AVX2OR512VL-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1647; AVX2OR512VL:       # %bb.0:
1648; AVX2OR512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
1649; AVX2OR512VL-NEXT:    vpsrlw $8, %xmm1, %xmm1
1650; AVX2OR512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1651; AVX2OR512VL-NEXT:    retq
1652;
1653; XOP-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
1654; XOP:       # %bb.0:
1655; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
1656; XOP-NEXT:    retq
1657  %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1658  %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
1659  %3 = bitcast <8 x i16> %1 to <16 x i8>
1660  %4 = bitcast <8 x i16> %2 to <16 x i8>
1661  %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
1662  ret <16 x i8> %5
1663}
1664
1665define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
1666; Nothing interesting to test here. Just make sure we didn't crashe.
1667; ALL-LABEL: stress_test2:
1668; ALL:         retq
1669entry:
1670  %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
1671  %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
1672  %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
1673
1674  ret <16 x i8> %s.2.0
1675}
1676
1677define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) {
1678; SSE-LABEL: constant_gets_selected:
1679; SSE:       # %bb.0: # %entry
1680; SSE-NEXT:    xorps %xmm0, %xmm0
1681; SSE-NEXT:    movaps %xmm0, (%rdi)
1682; SSE-NEXT:    movaps %xmm0, (%rsi)
1683; SSE-NEXT:    retq
1684;
1685; AVX-LABEL: constant_gets_selected:
1686; AVX:       # %bb.0: # %entry
1687; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
1688; AVX-NEXT:    vmovaps %xmm0, (%rdi)
1689; AVX-NEXT:    vmovaps %xmm0, (%rsi)
1690; AVX-NEXT:    retq
1691entry:
1692  %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
1693  %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
1694  %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
1695  store <4 x i32> %weirder_zero, <4 x i32>* %ptr1, align 16
1696  store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16
1697  ret void
1698}
1699
1700;
1701; Shuffle to logical bit shifts
1702;
1703
1704define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
1705; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1706; SSE:       # %bb.0:
1707; SSE-NEXT:    psllw $8, %xmm0
1708; SSE-NEXT:    retq
1709;
1710; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
1711; AVX:       # %bb.0:
1712; AVX-NEXT:    vpsllw $8, %xmm0, %xmm0
1713; AVX-NEXT:    retq
1714  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
1715  ret <16 x i8> %shuffle
1716}
1717
1718define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
1719; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1720; SSE:       # %bb.0:
1721; SSE-NEXT:    pslld $24, %xmm0
1722; SSE-NEXT:    retq
1723;
1724; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
1725; AVX:       # %bb.0:
1726; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
1727; AVX-NEXT:    retq
1728  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
1729  ret <16 x i8> %shuffle
1730}
1731
1732define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
1733; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1734; SSE:       # %bb.0:
1735; SSE-NEXT:    psllq $56, %xmm0
1736; SSE-NEXT:    retq
1737;
1738; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
1739; AVX:       # %bb.0:
1740; AVX-NEXT:    vpsllq $56, %xmm0, %xmm0
1741; AVX-NEXT:    retq
1742  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
1743  ret <16 x i8> %shuffle
1744}
1745
1746define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
1747; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1748; SSE:       # %bb.0:
1749; SSE-NEXT:    psllq $8, %xmm0
1750; SSE-NEXT:    retq
1751;
1752; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
1753; AVX:       # %bb.0:
1754; AVX-NEXT:    vpsllq $8, %xmm0, %xmm0
1755; AVX-NEXT:    retq
1756  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
1757  ret <16 x i8> %shuffle
1758}
1759
1760define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
1761; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1762; SSE:       # %bb.0:
1763; SSE-NEXT:    psrlw $8, %xmm0
1764; SSE-NEXT:    retq
1765;
1766; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
1767; AVX:       # %bb.0:
1768; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1769; AVX-NEXT:    retq
1770  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
1771  ret <16 x i8> %shuffle
1772}
1773
1774define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
1775; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1776; SSE:       # %bb.0:
1777; SSE-NEXT:    psrld $16, %xmm0
1778; SSE-NEXT:    retq
1779;
1780; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
1781; AVX:       # %bb.0:
1782; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
1783; AVX-NEXT:    retq
1784  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
1785  ret <16 x i8> %shuffle
1786}
1787
1788define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
1789; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1790; SSE:       # %bb.0:
1791; SSE-NEXT:    psrlq $56, %xmm0
1792; SSE-NEXT:    retq
1793;
1794; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
1795; AVX:       # %bb.0:
1796; AVX-NEXT:    vpsrlq $56, %xmm0, %xmm0
1797; AVX-NEXT:    retq
1798  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
1799  ret <16 x i8> %shuffle
1800}
1801
1802define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
1803; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
1804; SSE2:       # %bb.0:
1805; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
1806; SSE2-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1807; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
1808; SSE2-NEXT:    retq
1809;
1810; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
1811; SSSE3:       # %bb.0:
1812; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
1813; SSSE3-NEXT:    retq
1814;
1815; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
1816; SSE41:       # %bb.0:
1817; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
1818; SSE41-NEXT:    retq
1819;
1820; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
1821; AVX:       # %bb.0:
1822; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
1823; AVX-NEXT:    retq
1824  %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1825  ret <16 x i8> %shuffle
1826}
1827
1828define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
1829; SSE-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1830; SSE:       # %bb.0:
1831; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
1832; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1833; SSE-NEXT:    retq
1834;
1835; AVX1-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1836; AVX1:       # %bb.0:
1837; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
1838; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1839; AVX1-NEXT:    retq
1840;
1841; AVX2-SLOW-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1842; AVX2-SLOW:       # %bb.0:
1843; AVX2-SLOW-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
1844; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1845; AVX2-SLOW-NEXT:    retq
1846;
1847; AVX2-FAST-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1848; AVX2-FAST:       # %bb.0:
1849; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1850; AVX2-FAST-NEXT:    retq
1851;
1852; AVX512VL-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1853; AVX512VL:       # %bb.0:
1854; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1855; AVX512VL-NEXT:    retq
1856;
1857; XOP-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
1858; XOP:       # %bb.0:
1859; XOP-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
1860; XOP-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1861; XOP-NEXT:    retq
1862  %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1863  ret <16 x i8> %shuffle
1864}
1865
1866define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(<16 x i8> %a) {
1867; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1868; SSE:       # %bb.0:
1869; SSE-NEXT:    psrlq $8, %xmm0
1870; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1871; SSE-NEXT:    retq
1872;
1873; AVX1-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1874; AVX1:       # %bb.0:
1875; AVX1-NEXT:    vpsrlq $8, %xmm0, %xmm0
1876; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1877; AVX1-NEXT:    retq
1878;
1879; AVX2-SLOW-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1880; AVX2-SLOW:       # %bb.0:
1881; AVX2-SLOW-NEXT:    vpsrlq $8, %xmm0, %xmm0
1882; AVX2-SLOW-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1883; AVX2-SLOW-NEXT:    retq
1884;
1885; AVX2-FAST-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1886; AVX2-FAST:       # %bb.0:
1887; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
1888; AVX2-FAST-NEXT:    retq
1889;
1890; AVX512VL-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1891; AVX512VL:       # %bb.0:
1892; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6]
1893; AVX512VL-NEXT:    retq
1894;
1895; XOP-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06:
1896; XOP:       # %bb.0:
1897; XOP-NEXT:    vpsrlq $8, %xmm0, %xmm0
1898; XOP-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
1899; XOP-NEXT:    retq
1900  %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
1901  ret <16 x i8> %shuffle
1902}
1903
1904define <16 x i8> @shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14(<16 x i8> %a) {
1905; SSE2-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1906; SSE2:       # %bb.0:
1907; SSE2-NEXT:    movdqa %xmm0, %xmm1
1908; SSE2-NEXT:    psrld $24, %xmm1
1909; SSE2-NEXT:    pslld $8, %xmm0
1910; SSE2-NEXT:    por %xmm1, %xmm0
1911; SSE2-NEXT:    retq
1912;
1913; SSSE3-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1914; SSSE3:       # %bb.0:
1915; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
1916; SSSE3-NEXT:    retq
1917;
1918; SSE41-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1919; SSE41:       # %bb.0:
1920; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
1921; SSE41-NEXT:    retq
1922;
1923; AVX1-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1924; AVX1:       # %bb.0:
1925; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
1926; AVX1-NEXT:    retq
1927;
1928; AVX2-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1929; AVX2:       # %bb.0:
1930; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14]
1931; AVX2-NEXT:    retq
1932;
1933; AVX512VL-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1934; AVX512VL:       # %bb.0:
1935; AVX512VL-NEXT:    vprold $8, %xmm0, %xmm0
1936; AVX512VL-NEXT:    retq
1937;
1938; XOP-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14:
1939; XOP:       # %bb.0:
1940; XOP-NEXT:    vprotd $8, %xmm0, %xmm0
1941; XOP-NEXT:    retq
1942  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14>
1943  ret <16 x i8> %shuffle
1944}
1945
1946; PR44379
1947define <16 x i8> @shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09(<16 x i8> %a) {
1948; SSE-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
1949; SSE:       # %bb.0:
1950; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1951; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1952; SSE-NEXT:    retq
1953;
1954; AVX1-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
1955; AVX1:       # %bb.0:
1956; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1957; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1958; AVX1-NEXT:    retq
1959;
1960; AVX2-SLOW-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
1961; AVX2-SLOW:       # %bb.0:
1962; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1963; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1964; AVX2-SLOW-NEXT:    retq
1965;
1966; AVX2-FAST-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
1967; AVX2-FAST:       # %bb.0:
1968; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9]
1969; AVX2-FAST-NEXT:    retq
1970;
1971; AVX512VL-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
1972; AVX512VL:       # %bb.0:
1973; AVX512VL-NEXT:    vprolq $48, %xmm0, %xmm0
1974; AVX512VL-NEXT:    retq
1975;
1976; XOP-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09:
1977; XOP:       # %bb.0:
1978; XOP-NEXT:    vprotq $48, %xmm0, %xmm0
1979; XOP-NEXT:    retq
1980  %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9>
1981  ret <16 x i8> %shuffle
1982}
1983
1984define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
1985; SSE-LABEL: PR12412:
1986; SSE:       # %bb.0: # %entry
1987; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1988; SSE-NEXT:    pand %xmm2, %xmm1
1989; SSE-NEXT:    pand %xmm2, %xmm0
1990; SSE-NEXT:    packuswb %xmm1, %xmm0
1991; SSE-NEXT:    retq
1992;
1993; AVX1-LABEL: PR12412:
1994; AVX1:       # %bb.0: # %entry
1995; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
1996; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
1997; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1998; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1999; AVX1-NEXT:    retq
2000;
2001; AVX2-LABEL: PR12412:
2002; AVX2:       # %bb.0: # %entry
2003; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
2004; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
2005; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
2006; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2007; AVX2-NEXT:    retq
2008;
2009; AVX512VL-LABEL: PR12412:
2010; AVX512VL:       # %bb.0: # %entry
2011; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2012; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
2013; AVX512VL-NEXT:    vpmovwb %ymm0, %xmm0
2014; AVX512VL-NEXT:    vzeroupper
2015; AVX512VL-NEXT:    retq
2016;
2017; XOP-LABEL: PR12412:
2018; XOP:       # %bb.0: # %entry
2019; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14]
2020; XOP-NEXT:    retq
2021entry:
2022  %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
2023  ret <16 x i8> %0
2024}
2025
2026define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) {
2027; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
2028; SSE:       # %bb.0:
2029; SSE-NEXT:    psrld $8, %xmm0
2030; SSE-NEXT:    retq
2031;
2032; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
2033; AVX:       # %bb.0:
2034; AVX-NEXT:    vpsrld $8, %xmm0, %xmm0
2035; AVX-NEXT:    retq
2036  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
2037  ret <16 x i8> %shuffle
2038}
2039
2040define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
2041; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
2042; SSE:       # %bb.0:
2043; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2044; SSE-NEXT:    retq
2045;
2046; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
2047; AVX:       # %bb.0:
2048; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2049; AVX-NEXT:    retq
2050  %shuffle8  = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
2051  %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
2052  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2053  %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
2054  %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
2055  %bitcast8  = bitcast <8 x i16> %shuffle16 to <16 x i8>
2056  ret <16 x i8> %bitcast8
2057}
2058
2059define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
2060; SSE2-LABEL: insert_dup_mem_v16i8_i32:
2061; SSE2:       # %bb.0:
2062; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2063; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2064; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2065; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2066; SSE2-NEXT:    retq
2067;
2068; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
2069; SSSE3:       # %bb.0:
2070; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2071; SSSE3-NEXT:    pxor %xmm1, %xmm1
2072; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2073; SSSE3-NEXT:    retq
2074;
2075; SSE41-LABEL: insert_dup_mem_v16i8_i32:
2076; SSE41:       # %bb.0:
2077; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2078; SSE41-NEXT:    pxor %xmm1, %xmm1
2079; SSE41-NEXT:    pshufb %xmm1, %xmm0
2080; SSE41-NEXT:    retq
2081;
2082; AVX1-LABEL: insert_dup_mem_v16i8_i32:
2083; AVX1:       # %bb.0:
2084; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2085; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2086; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
2087; AVX1-NEXT:    retq
2088;
2089; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_i32:
2090; AVX2OR512VL:       # %bb.0:
2091; AVX2OR512VL-NEXT:    vpbroadcastb (%rdi), %xmm0
2092; AVX2OR512VL-NEXT:    retq
2093;
2094; XOPAVX1-LABEL: insert_dup_mem_v16i8_i32:
2095; XOPAVX1:       # %bb.0:
2096; XOPAVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2097; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2098; XOPAVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
2099; XOPAVX1-NEXT:    retq
2100;
2101; XOPAVX2-LABEL: insert_dup_mem_v16i8_i32:
2102; XOPAVX2:       # %bb.0:
2103; XOPAVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
2104; XOPAVX2-NEXT:    retq
2105  %tmp = load i32, i32* %ptr, align 4
2106  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2107  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
2108  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer
2109  ret <16 x i8> %tmp3
2110}
2111
2112define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
2113; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8:
2114; SSE2:       # %bb.0:
2115; SSE2-NEXT:    movzbl (%rdi), %eax
2116; SSE2-NEXT:    movd %eax, %xmm0
2117; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2118; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2119; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2120; SSE2-NEXT:    retq
2121;
2122; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
2123; SSSE3:       # %bb.0:
2124; SSSE3-NEXT:    movzbl (%rdi), %eax
2125; SSSE3-NEXT:    movd %eax, %xmm0
2126; SSSE3-NEXT:    pxor %xmm1, %xmm1
2127; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2128; SSSE3-NEXT:    retq
2129;
2130; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8:
2131; SSE41:       # %bb.0:
2132; SSE41-NEXT:    movzbl (%rdi), %eax
2133; SSE41-NEXT:    movd %eax, %xmm0
2134; SSE41-NEXT:    pxor %xmm1, %xmm1
2135; SSE41-NEXT:    pshufb %xmm1, %xmm0
2136; SSE41-NEXT:    retq
2137;
2138; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
2139; AVX1:       # %bb.0:
2140; AVX1-NEXT:    movzbl (%rdi), %eax
2141; AVX1-NEXT:    vmovd %eax, %xmm0
2142; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2143; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
2144; AVX1-NEXT:    retq
2145;
2146; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_sext_i8:
2147; AVX2OR512VL:       # %bb.0:
2148; AVX2OR512VL-NEXT:    vpbroadcastb (%rdi), %xmm0
2149; AVX2OR512VL-NEXT:    retq
2150;
2151; XOPAVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
2152; XOPAVX1:       # %bb.0:
2153; XOPAVX1-NEXT:    movzbl (%rdi), %eax
2154; XOPAVX1-NEXT:    vmovd %eax, %xmm0
2155; XOPAVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2156; XOPAVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
2157; XOPAVX1-NEXT:    retq
2158;
2159; XOPAVX2-LABEL: insert_dup_mem_v16i8_sext_i8:
2160; XOPAVX2:       # %bb.0:
2161; XOPAVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
2162; XOPAVX2-NEXT:    retq
2163  %tmp = load i8, i8* %ptr, align 1
2164  %tmp1 = sext i8 %tmp to i32
2165  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
2166  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
2167  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
2168  ret <16 x i8> %tmp4
2169}
2170
2171define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
2172; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32:
2173; SSE2:       # %bb.0:
2174; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2175; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2176; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
2177; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2178; SSE2-NEXT:    retq
2179;
2180; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
2181; SSSE3:       # %bb.0:
2182; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2183; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2184; SSSE3-NEXT:    retq
2185;
2186; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32:
2187; SSE41:       # %bb.0:
2188; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2189; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2190; SSE41-NEXT:    retq
2191;
2192; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
2193; AVX1:       # %bb.0:
2194; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2195; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2196; AVX1-NEXT:    retq
2197;
2198; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i8_i32:
2199; AVX2OR512VL:       # %bb.0:
2200; AVX2OR512VL-NEXT:    vpbroadcastb 1(%rdi), %xmm0
2201; AVX2OR512VL-NEXT:    retq
2202;
2203; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
2204; XOPAVX1:       # %bb.0:
2205; XOPAVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2206; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2207; XOPAVX1-NEXT:    retq
2208;
2209; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i8_i32:
2210; XOPAVX2:       # %bb.0:
2211; XOPAVX2-NEXT:    vpbroadcastb 1(%rdi), %xmm0
2212; XOPAVX2-NEXT:    retq
2213  %tmp = load i32, i32* %ptr, align 4
2214  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2215  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
2216  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2217  ret <16 x i8> %tmp3
2218}
2219
2220define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
2221; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32:
2222; SSE2:       # %bb.0:
2223; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2224; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2225; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
2226; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2227; SSE2-NEXT:    retq
2228;
2229; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
2230; SSSE3:       # %bb.0:
2231; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2232; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2233; SSSE3-NEXT:    retq
2234;
2235; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32:
2236; SSE41:       # %bb.0:
2237; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2238; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2239; SSE41-NEXT:    retq
2240;
2241; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
2242; AVX1:       # %bb.0:
2243; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2244; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2245; AVX1-NEXT:    retq
2246;
2247; AVX2OR512VL-LABEL: insert_dup_elt2_mem_v16i8_i32:
2248; AVX2OR512VL:       # %bb.0:
2249; AVX2OR512VL-NEXT:    vpbroadcastb 2(%rdi), %xmm0
2250; AVX2OR512VL-NEXT:    retq
2251;
2252; XOPAVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
2253; XOPAVX1:       # %bb.0:
2254; XOPAVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2255; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2256; XOPAVX1-NEXT:    retq
2257;
2258; XOPAVX2-LABEL: insert_dup_elt2_mem_v16i8_i32:
2259; XOPAVX2:       # %bb.0:
2260; XOPAVX2-NEXT:    vpbroadcastb 2(%rdi), %xmm0
2261; XOPAVX2-NEXT:    retq
2262  %tmp = load i32, i32* %ptr, align 4
2263  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
2264  %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8>
2265  %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2266  ret <16 x i8> %tmp3
2267}
2268
2269define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
2270; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
2271; SSE2:       # %bb.0:
2272; SSE2-NEXT:    movsbl (%rdi), %eax
2273; SSE2-NEXT:    movd %eax, %xmm0
2274; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2275; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
2276; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2277; SSE2-NEXT:    retq
2278;
2279; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
2280; SSSE3:       # %bb.0:
2281; SSSE3-NEXT:    movsbl (%rdi), %eax
2282; SSSE3-NEXT:    movd %eax, %xmm0
2283; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2284; SSSE3-NEXT:    retq
2285;
2286; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
2287; SSE41:       # %bb.0:
2288; SSE41-NEXT:    movsbl (%rdi), %eax
2289; SSE41-NEXT:    movd %eax, %xmm0
2290; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2291; SSE41-NEXT:    retq
2292;
2293; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
2294; AVX1:       # %bb.0:
2295; AVX1-NEXT:    movsbl (%rdi), %eax
2296; AVX1-NEXT:    vmovd %eax, %xmm0
2297; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2298; AVX1-NEXT:    retq
2299;
2300; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
2301; AVX2:       # %bb.0:
2302; AVX2-NEXT:    movsbl (%rdi), %eax
2303; AVX2-NEXT:    shrl $8, %eax
2304; AVX2-NEXT:    vmovd %eax, %xmm0
2305; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
2306; AVX2-NEXT:    retq
2307;
2308; AVX512VL-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
2309; AVX512VL:       # %bb.0:
2310; AVX512VL-NEXT:    movsbl (%rdi), %eax
2311; AVX512VL-NEXT:    shrl $8, %eax
2312; AVX512VL-NEXT:    vpbroadcastb %eax, %xmm0
2313; AVX512VL-NEXT:    retq
2314;
2315; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
2316; XOPAVX1:       # %bb.0:
2317; XOPAVX1-NEXT:    movsbl (%rdi), %eax
2318; XOPAVX1-NEXT:    vmovd %eax, %xmm0
2319; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
2320; XOPAVX1-NEXT:    retq
2321;
2322; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
2323; XOPAVX2:       # %bb.0:
2324; XOPAVX2-NEXT:    movsbl (%rdi), %eax
2325; XOPAVX2-NEXT:    shrl $8, %eax
2326; XOPAVX2-NEXT:    vmovd %eax, %xmm0
2327; XOPAVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
2328; XOPAVX2-NEXT:    retq
2329  %tmp = load i8, i8* %ptr, align 1
2330  %tmp1 = sext i8 %tmp to i32
2331  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
2332  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
2333  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
2334  ret <16 x i8> %tmp4
2335}
2336
2337define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
2338; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2339; SSE2:       # %bb.0:
2340; SSE2-NEXT:    movsbl (%rdi), %eax
2341; SSE2-NEXT:    movd %eax, %xmm0
2342; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2343; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
2344; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2345; SSE2-NEXT:    retq
2346;
2347; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2348; SSSE3:       # %bb.0:
2349; SSSE3-NEXT:    movsbl (%rdi), %eax
2350; SSSE3-NEXT:    movd %eax, %xmm0
2351; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2352; SSSE3-NEXT:    retq
2353;
2354; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2355; SSE41:       # %bb.0:
2356; SSE41-NEXT:    movsbl (%rdi), %eax
2357; SSE41-NEXT:    movd %eax, %xmm0
2358; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2359; SSE41-NEXT:    retq
2360;
2361; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2362; AVX1:       # %bb.0:
2363; AVX1-NEXT:    movsbl (%rdi), %eax
2364; AVX1-NEXT:    vmovd %eax, %xmm0
2365; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2366; AVX1-NEXT:    retq
2367;
2368; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2369; AVX2:       # %bb.0:
2370; AVX2-NEXT:    movsbl (%rdi), %eax
2371; AVX2-NEXT:    shrl $16, %eax
2372; AVX2-NEXT:    vmovd %eax, %xmm0
2373; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
2374; AVX2-NEXT:    retq
2375;
2376; AVX512VL-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2377; AVX512VL:       # %bb.0:
2378; AVX512VL-NEXT:    movsbl (%rdi), %eax
2379; AVX512VL-NEXT:    shrl $16, %eax
2380; AVX512VL-NEXT:    vpbroadcastb %eax, %xmm0
2381; AVX512VL-NEXT:    retq
2382;
2383; XOPAVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2384; XOPAVX1:       # %bb.0:
2385; XOPAVX1-NEXT:    movsbl (%rdi), %eax
2386; XOPAVX1-NEXT:    vmovd %eax, %xmm0
2387; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
2388; XOPAVX1-NEXT:    retq
2389;
2390; XOPAVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
2391; XOPAVX2:       # %bb.0:
2392; XOPAVX2-NEXT:    movsbl (%rdi), %eax
2393; XOPAVX2-NEXT:    shrl $16, %eax
2394; XOPAVX2-NEXT:    vmovd %eax, %xmm0
2395; XOPAVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
2396; XOPAVX2-NEXT:    retq
2397  %tmp = load i8, i8* %ptr, align 1
2398  %tmp1 = sext i8 %tmp to i32
2399  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
2400  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
2401  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
2402  ret <16 x i8> %tmp4
2403}
2404
2405define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) {
2406; SSE2-LABEL: PR31364:
2407; SSE2:       # %bb.0:
2408; SSE2-NEXT:    movzbl (%rdi), %eax
2409; SSE2-NEXT:    movzbl (%rsi), %ecx
2410; SSE2-NEXT:    shll $8, %ecx
2411; SSE2-NEXT:    orl %eax, %ecx
2412; SSE2-NEXT:    movd %ecx, %xmm1
2413; SSE2-NEXT:    pxor %xmm0, %xmm0
2414; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2415; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7]
2416; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
2417; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
2418; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
2419; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4]
2420; SSE2-NEXT:    packuswb %xmm1, %xmm0
2421; SSE2-NEXT:    retq
2422;
2423; SSSE3-LABEL: PR31364:
2424; SSSE3:       # %bb.0:
2425; SSSE3-NEXT:    movzbl (%rdi), %eax
2426; SSSE3-NEXT:    movzbl (%rsi), %ecx
2427; SSSE3-NEXT:    shll $8, %ecx
2428; SSSE3-NEXT:    orl %eax, %ecx
2429; SSSE3-NEXT:    movd %ecx, %xmm0
2430; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
2431; SSSE3-NEXT:    retq
2432;
2433; SSE41-LABEL: PR31364:
2434; SSE41:       # %bb.0:
2435; SSE41-NEXT:    movzbl (%rdi), %eax
2436; SSE41-NEXT:    movd %eax, %xmm0
2437; SSE41-NEXT:    pinsrb $1, (%rsi), %xmm0
2438; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
2439; SSE41-NEXT:    retq
2440;
2441; AVX-LABEL: PR31364:
2442; AVX:       # %bb.0:
2443; AVX-NEXT:    movzbl (%rdi), %eax
2444; AVX-NEXT:    vmovd %eax, %xmm0
2445; AVX-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm0
2446; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
2447; AVX-NEXT:    retq
2448  %v0 = load i8, i8* %a, align 1
2449  %vecins = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %v0, i32 0
2450  %v1 = load i8, i8* %b, align 1
2451  %vecins2 = insertelement <16 x i8> %vecins, i8 %v1, i32 1
2452  %result = shufflevector <16 x i8> %vecins2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 3, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
2453  ret <16 x i8> %result
2454}
2455
2456define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y)  {
2457; SSE2-LABEL: PR31301:
2458; SSE2:       # %bb.0: # %entry
2459; SSE2-NEXT:    movzbl (%rdi), %eax
2460; SSE2-NEXT:    movd %eax, %xmm0
2461; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2462; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2463; SSE2-NEXT:    movzbl (%rsi), %eax
2464; SSE2-NEXT:    movd %eax, %xmm1
2465; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2466; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
2467; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2468; SSE2-NEXT:    retq
2469;
2470; SSSE3-LABEL: PR31301:
2471; SSSE3:       # %bb.0: # %entry
2472; SSSE3-NEXT:    movzbl (%rdi), %eax
2473; SSSE3-NEXT:    movd %eax, %xmm0
2474; SSSE3-NEXT:    pxor %xmm1, %xmm1
2475; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2476; SSSE3-NEXT:    movzbl (%rsi), %eax
2477; SSSE3-NEXT:    movd %eax, %xmm2
2478; SSSE3-NEXT:    pshufb %xmm1, %xmm2
2479; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2480; SSSE3-NEXT:    retq
2481;
2482; SSE41-LABEL: PR31301:
2483; SSE41:       # %bb.0: # %entry
2484; SSE41-NEXT:    movzbl (%rdi), %eax
2485; SSE41-NEXT:    movd %eax, %xmm0
2486; SSE41-NEXT:    pxor %xmm1, %xmm1
2487; SSE41-NEXT:    pshufb %xmm1, %xmm0
2488; SSE41-NEXT:    movzbl (%rsi), %eax
2489; SSE41-NEXT:    movd %eax, %xmm2
2490; SSE41-NEXT:    pshufb %xmm1, %xmm2
2491; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2492; SSE41-NEXT:    retq
2493;
2494; AVX1-LABEL: PR31301:
2495; AVX1:       # %bb.0: # %entry
2496; AVX1-NEXT:    movzbl (%rdi), %eax
2497; AVX1-NEXT:    vmovd %eax, %xmm0
2498; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2499; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
2500; AVX1-NEXT:    movzbl (%rsi), %eax
2501; AVX1-NEXT:    vmovd %eax, %xmm2
2502; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
2503; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2504; AVX1-NEXT:    retq
2505;
2506; AVX2OR512VL-LABEL: PR31301:
2507; AVX2OR512VL:       # %bb.0: # %entry
2508; AVX2OR512VL-NEXT:    vpbroadcastb (%rdi), %xmm0
2509; AVX2OR512VL-NEXT:    vpbroadcastb (%rsi), %xmm1
2510; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2511; AVX2OR512VL-NEXT:    retq
2512;
2513; XOPAVX1-LABEL: PR31301:
2514; XOPAVX1:       # %bb.0: # %entry
2515; XOPAVX1-NEXT:    movzbl (%rdi), %eax
2516; XOPAVX1-NEXT:    vmovd %eax, %xmm0
2517; XOPAVX1-NEXT:    movzbl (%rsi), %eax
2518; XOPAVX1-NEXT:    vmovd %eax, %xmm1
2519; XOPAVX1-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0]
2520; XOPAVX1-NEXT:    retq
2521;
2522; XOPAVX2-LABEL: PR31301:
2523; XOPAVX2:       # %bb.0: # %entry
2524; XOPAVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
2525; XOPAVX2-NEXT:    vpbroadcastb (%rsi), %xmm1
2526; XOPAVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2527; XOPAVX2-NEXT:    retq
2528entry:
2529  %0 = load i8, i8* %x, align 1
2530  %1 = insertelement <16 x i8> undef, i8 %0, i32 0
2531  %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2532  %2 = load i8, i8* %y, align 1
2533  %3 = insertelement <16 x i8> undef, i8 %2, i32 0
2534  %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2535  %vzip.i = shufflevector <16 x i8> %lane, <16 x i8> %lane3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
2536  ret <16 x i8> %vzip.i
2537}
2538