; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-ALL %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-PERLANE %s

; These patterns are produced by LoopVectorizer for interleaved stores.
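; As an illustrative sketch (not taken from this file), a scalar loop of the
; following shape is the kind of source the LoopVectorizer turns into the
; stride-5 interleaved store patterns checked below; the function and
; parameter names here are hypothetical:
;
;   void store5(short *out, const short *a, const short *b, const short *c,
;               const short *d, const short *e, int n) {
;     for (int i = 0; i < n; ++i) {
;       out[5*i+0] = a[i];
;       out[5*i+1] = b[i];
;       out[5*i+2] = c[i];
;       out[5*i+3] = d[i];
;       out[5*i+4] = e[i];
;     }
;   }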

define void @vf2(<2 x i16>* %in.vecptr0, <2 x i16>* %in.vecptr1, <2 x i16>* %in.vecptr2, <2 x i16>* %in.vecptr3, <2 x i16>* %in.vecptr4, <10 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11,u,u,18,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,3,4,7,4,7]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vmovd %xmm1, 16(%r9)
; AVX2-SLOW-NEXT:    vmovdqa %xmm0, (%r9)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: vf2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT:    vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,ymm0[30,31,30,31,16,17,18,19,28,29,30,31]
; AVX2-FAST-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT:    vmovd %xmm1, 16(%r9)
; AVX2-FAST-NEXT:    vmovdqa %xmm0, (%r9)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
  %in.vec0 = load <2 x i16>, <2 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <2 x i16>, <2 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <2 x i16>, <2 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <2 x i16>, <2 x i16>* %in.vecptr3, align 32
  %in.vec4 = load <2 x i16>, <2 x i16>* %in.vecptr4, align 32

  %concat01 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat23 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat0123 = shufflevector <4 x i16> %concat01, <4 x i16> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat4uuu = shufflevector <2 x i16> %in.vec4, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat01234 = shufflevector <8 x i16> %concat0123, <8 x i16> %concat4uuu, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %interleaved.vec = shufflevector <10 x i16> %concat01234, <10 x i16> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>

  store <10 x i16> %interleaved.vec, <10 x i16>* %out.vec, align 32

  ret void
}

define void @vf4(<4 x i16>* %in.vecptr0, <4 x i16>* %in.vecptr1, <4 x i16>* %in.vecptr2, <4 x i16>* %in.vecptr3, <4 x i16>* %in.vecptr4, <20 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf4:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-SLOW-NEXT:    vpbroadcastq %xmm3, %ymm3
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-SLOW-NEXT:    vmovq %xmm0, 32(%r9)
; AVX2-SLOW-NEXT:    vmovdqa %ymm2, (%r9)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: vf4:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FAST-NEXT:    vpbroadcastq %xmm3, %ymm3
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u]
; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23]
; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15]
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX2-FAST-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-FAST-NEXT:    vmovq %xmm0, 32(%r9)
; AVX2-FAST-NEXT:    vmovdqa %ymm2, (%r9)
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
  %in.vec0 = load <4 x i16>, <4 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <4 x i16>, <4 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <4 x i16>, <4 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <4 x i16>, <4 x i16>* %in.vecptr3, align 32
  %in.vec4 = load <4 x i16>, <4 x i16>* %in.vecptr4, align 32

  %concat01 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat23 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat0123 = shufflevector <8 x i16> %concat01, <8 x i16> %concat23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %concat4uuu = shufflevector <4 x i16> %in.vec4, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat01234 = shufflevector <16 x i16> %concat0123, <16 x i16> %concat4uuu, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
  %interleaved.vec = shufflevector <20 x i16> %concat01234, <20 x i16> poison, <20 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 1, i32 5, i32 9, i32 13, i32 17, i32 2, i32 6, i32 10, i32 14, i32 18, i32 3, i32 7, i32 11, i32 15, i32 19>

  store <20 x i16> %interleaved.vec, <20 x i16>* %out.vec, align 32

  ret void
}

define void @vf8(<8 x i16>* %in.vecptr0, <8 x i16>* %in.vecptr1, <8 x i16>* %in.vecptr2, <8 x i16>* %in.vecptr3, <8 x i16>* %in.vecptr4, <40 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf8:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm3
; AVX2-SLOW-NEXT:    vmovdqa (%rsi), %xmm0
; AVX2-SLOW-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-SLOW-NEXT:    vmovdqa (%rcx), %xmm2
; AVX2-SLOW-NEXT:    vmovdqa (%r8), %xmm4
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm5
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm5[6,7,u,u,u,u,10,11,u,u,8,9,u,u,u,u,22,23,u,u,u,u,26,27,u,u,24,25,u,u,u,u]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,6,7,10,11,u,u,8,9,u,u,8,9,12,13,u,u,22,23,26,27,u,u,24,25,u,u,24,25,28,29]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm6
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,10,11,6,7,u,u,u,u,10,11,12,13,8,9,24,25,26,27,22,23,u,u,u,u,26,27,28,29,24,25]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4],ymm7[5,6,7,8,9,10],ymm6[11,12],ymm7[13,14,15]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-SLOW-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-SLOW-NEXT:    vpbroadcastq 8(%rdi), %ymm6
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm6
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero
; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm8
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm8[22,23]
; AVX2-SLOW-NEXT:    vpor %ymm6, %ymm8, %ymm6
; AVX2-SLOW-NEXT:    vpbroadcastq (%r8), %ymm8
; AVX2-SLOW-NEXT:    vpblendvb %ymm7, %ymm6, %ymm8, %ymm6
; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,6]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
; AVX2-SLOW-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-SLOW-NEXT:    vmovdqa %xmm0, 64(%r9)
; AVX2-SLOW-NEXT:    vmovdqa %ymm6, (%r9)
; AVX2-SLOW-NEXT:    vmovdqa %ymm5, 32(%r9)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: vf8:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rsi), %xmm0
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rdx), %xmm2
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rcx), %xmm3
; AVX2-FAST-ALL-NEXT:    vmovdqa (%r8), %xmm4
; AVX2-FAST-ALL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm5
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,12,13],zero,zero,zero,zero,ymm5[2,3,18,19,18,19],zero,zero,zero,zero,ymm5[28,29,20,21,28,29],zero,zero
; AVX2-FAST-ALL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm6
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm6[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm6[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm6[22,23]
; AVX2-FAST-ALL-NEXT:    vpor %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT:    vpbroadcastq (%r8), %ymm6
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm6
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm8 = <1,5,2,u,6,2,u,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm6, %ymm8, %ymm6
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[2,3,6,7,6,7],zero,zero,zero,zero,ymm6[8,9,16,17,18,19],zero,zero,zero,zero,ymm6[22,23,18,19,18,19],zero,zero
; AVX2-FAST-ALL-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm8
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm9 = <1,5,2,6,2,6,3,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm8, %ymm9, %ymm8
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[2,3,6,7],zero,zero,zero,zero,zero,zero,ymm8[8,9,12,13],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25]
; AVX2-FAST-ALL-NEXT:    vpor %ymm6, %ymm8, %ymm6
; AVX2-FAST-ALL-NEXT:    vpbroadcastq 8(%rdi), %ymm8
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm7, %ymm6, %ymm8, %ymm6
; AVX2-FAST-ALL-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,8,9,14,15,u,u,u,u,u,u,12,13]
; AVX2-FAST-ALL-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u]
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6],xmm1[7]
; AVX2-FAST-ALL-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-FAST-ALL-NEXT:    vmovdqa %xmm0, 64(%r9)
; AVX2-FAST-ALL-NEXT:    vmovdqa %ymm6, 32(%r9)
; AVX2-FAST-ALL-NEXT:    vmovdqa %ymm5, (%r9)
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: vf8:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm2
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsi), %xmm0
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rcx), %xmm3
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%r8), %xmm4
; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm5
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm6 = ymm5[6,7,u,u,u,u,10,11,u,u,8,9,u,u,u,u,22,23,u,u,u,u,26,27,u,u,24,25,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm5 = ymm5[u,u,6,7,10,11,u,u,8,9,u,u,8,9,12,13,u,u,22,23,26,27,u,u,24,25,u,u,24,25,28,29]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm6
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm7 = ymm6[8,9,10,11,6,7,u,u,u,u,10,11,12,13,8,9,24,25,26,27,22,23,u,u,u,u,26,27,28,29,24,25]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4],ymm7[5,6,7,8,9,10],ymm6[11,12],ymm7[13,14,15]
; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm6
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero
; AVX2-FAST-PERLANE-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm8
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm8[22,23]
; AVX2-FAST-PERLANE-NEXT:    vpor %ymm6, %ymm8, %ymm6
; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq (%r8), %ymm8
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm6, %ymm8, %ymm6
; AVX2-FAST-PERLANE-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,14,15,u,u,u,u,u,u,12,13]
; AVX2-FAST-PERLANE-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
; AVX2-FAST-PERLANE-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %xmm0, 64(%r9)
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm6, (%r9)
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm5, 32(%r9)
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
  %in.vec0 = load <8 x i16>, <8 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <8 x i16>, <8 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <8 x i16>, <8 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <8 x i16>, <8 x i16>* %in.vecptr3, align 32
  %in.vec4 = load <8 x i16>, <8 x i16>* %in.vecptr4, align 32

  %concat01 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %concat23 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %concat0123 = shufflevector <16 x i16> %concat01, <16 x i16> %concat23, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %concat4uuu = shufflevector <8 x i16> %in.vec4, <8 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat01234 = shufflevector <32 x i16> %concat0123, <32 x i16> %concat4uuu, <40 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
  %interleaved.vec = shufflevector <40 x i16> %concat01234, <40 x i16> poison, <40 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 1, i32 9, i32 17, i32 25, i32 33, i32 2, i32 10, i32 18, i32 26, i32 34, i32 3, i32 11, i32 19, i32 27, i32 35, i32 4, i32 12, i32 20, i32 28, i32 36, i32 5, i32 13, i32 21, i32 29, i32 37, i32 6, i32 14, i32 22, i32 30, i32 38, i32 7, i32 15, i32 23, i32 31, i32 39>

  store <40 x i16> %interleaved.vec, <40 x i16>* %out.vec, align 32

  ret void
}

define void @vf16(<16 x i16>* %in.vecptr0, <16 x i16>* %in.vecptr1, <16 x i16>* %in.vecptr2, <16 x i16>* %in.vecptr3, <16 x i16>* %in.vecptr4, <80 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf16:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm12
; AVX2-SLOW-NEXT:    vmovdqa (%rsi), %ymm11
; AVX2-SLOW-NEXT:    vmovdqa (%rdx), %ymm3
; AVX2-SLOW-NEXT:    vmovdqa (%rcx), %ymm4
; AVX2-SLOW-NEXT:    vmovdqa (%r8), %ymm2
; AVX2-SLOW-NEXT:    vmovdqa (%rdx), %xmm5
; AVX2-SLOW-NEXT:    vmovdqa (%rcx), %xmm6
; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
; AVX2-SLOW-NEXT:    vmovdqa (%rsi), %xmm0
; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-SLOW-NEXT:    vpblendvb %ymm10, %ymm1, %ymm7, %ymm1
; AVX2-SLOW-NEXT:    vpbroadcastq (%r8), %ymm8
; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT:    vpblendvb %ymm7, %ymm1, %ymm8, %ymm9
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[1,2,2,2]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,u,u,10,11,u,u,8,9,8,9,u,u,12,13]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm6[3,3,3,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,4]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3],xmm5[4],xmm1[5,6],xmm5[7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-SLOW-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT:    vpbroadcastq 8(%rdi), %ymm1
; AVX2-SLOW-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm8
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,u,u,24,25,20,21,u,u,24,25]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-SLOW-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-SLOW-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm5 = ymm3[3,2,3,3,7,6,7,7]
; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,2,4,6,7,6]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX2-SLOW-NEXT:    vpblendvb %ymm10, %ymm1, %ymm5, %ymm1
; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-SLOW-NEXT:    vpblendvb %ymm7, %ymm1, %ymm5, %ymm1
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2]
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4],ymm4[5],ymm2[6,7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12],ymm4[13],ymm2[14,15]
; AVX2-SLOW-NEXT:    vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm3 = ymm11[0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31]
; AVX2-SLOW-NEXT:    vpblendvb %ymm7, %ymm2, %ymm3, %ymm2
; AVX2-SLOW-NEXT:    vmovdqa %ymm2, 64(%r9)
; AVX2-SLOW-NEXT:    vmovdqa %ymm1, 128(%r9)
; AVX2-SLOW-NEXT:    vmovdqa %ymm0, 96(%r9)
; AVX2-SLOW-NEXT:    vmovdqa %ymm8, 32(%r9)
; AVX2-SLOW-NEXT:    vmovdqa %ymm9, (%r9)
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: vf16:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rdi), %ymm12
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rsi), %ymm11
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rdx), %ymm3
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rcx), %ymm4
; AVX2-FAST-ALL-NEXT:    vmovdqa (%r8), %ymm2
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rsi), %xmm5
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rdi), %xmm6
; AVX2-FAST-ALL-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[0,1,0,1]
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rdx), %xmm6
; AVX2-FAST-ALL-NEXT:    vmovdqa (%rcx), %xmm0
; AVX2-FAST-ALL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm10, %ymm7, %ymm1, %ymm1
; AVX2-FAST-ALL-NEXT:    vpbroadcastq (%r8), %ymm8
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm7, %ymm1, %ymm8, %ymm9
; AVX2-FAST-ALL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,u,u,10,11,u,u,8,9,8,9,u,u,12,13]
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-ALL-NEXT:    vpshufd {{.*#+}} xmm5 = mem[2,1,2,3]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,6,7,u,u,10,11,u,u,u,u,8,9]
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4],xmm5[5,6],xmm0[7]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm10, %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vpbroadcastq 8(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm8
; AVX2-FAST-ALL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[0,1,0,1,4,5,4,5]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,u,u,24,25,20,21,u,u,24,25]
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u]
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [25769803781,25769803781,25769803781,25769803781]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm3, %ymm1, %ymm1
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-FAST-ALL-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29]
; AVX2-FAST-ALL-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[3,2,3,3,7,6,7,7]
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm10, %ymm1, %ymm5, %ymm1
; AVX2-FAST-ALL-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm7, %ymm1, %ymm5, %ymm1
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX2-FAST-ALL-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2]
; AVX2-FAST-ALL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-FAST-ALL-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4],ymm4[5],ymm2[6,7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12],ymm4[13],ymm2[14,15]
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
; AVX2-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm3 = ymm11[0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31]
; AVX2-FAST-ALL-NEXT:    vpblendvb %ymm7, %ymm2, %ymm3, %ymm2
; AVX2-FAST-ALL-NEXT:    vmovdqa %ymm2, 64(%r9)
; AVX2-FAST-ALL-NEXT:    vmovdqa %ymm1, 128(%r9)
; AVX2-FAST-ALL-NEXT:    vmovdqa %ymm0, 96(%r9)
; AVX2-FAST-ALL-NEXT:    vmovdqa %ymm8, 32(%r9)
; AVX2-FAST-ALL-NEXT:    vmovdqa %ymm9, (%r9)
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: vf16:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm12
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsi), %ymm11
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdx), %ymm3
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rcx), %ymm4
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%r8), %ymm2
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rsi), %xmm5
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm7 = ymm6[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rdx), %xmm6
; AVX2-FAST-PERLANE-NEXT:    vmovdqa (%rcx), %xmm0
; AVX2-FAST-PERLANE-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm10, %ymm7, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq (%r8), %ymm8
; AVX2-FAST-PERLANE-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm1, %ymm8, %ymm9
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[6,7,u,u,10,11,u,u,8,9,8,9,u,u,12,13]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm5 = mem[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,6,7,u,u,10,11,u,u,u,u,8,9]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4],xmm5[5,6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm10, %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vpbroadcastq 8(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm8
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm2[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,u,u,24,25,20,21,u,u,24,25]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm5 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29]
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm6 = ymm3[3,2,3,3,7,6,7,7]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm10, %ymm1, %ymm5, %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpsrldq {{.*#+}} ymm5 = ymm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm1, %ymm5, %ymm1
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT:    vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4],ymm4[5],ymm2[6,7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12],ymm4[13],ymm2[14,15]
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT:    vpshufb {{.*#+}} ymm3 = ymm11[0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31]
; AVX2-FAST-PERLANE-NEXT:    vpblendvb %ymm7, %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm2, 64(%r9)
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm1, 128(%r9)
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm0, 96(%r9)
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm8, 32(%r9)
; AVX2-FAST-PERLANE-NEXT:    vmovdqa %ymm9, (%r9)
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
  %in.vec0 = load <16 x i16>, <16 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <16 x i16>, <16 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <16 x i16>, <16 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <16 x i16>, <16 x i16>* %in.vecptr3, align 32
  %in.vec4 = load <16 x i16>, <16 x i16>* %in.vecptr4, align 32

  %concat01 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %concat23 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %concat0123 = shufflevector <32 x i16> %concat01, <32 x i16> %concat23, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %concat4uuu = shufflevector <16 x i16> %in.vec4, <16 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat01234 = shufflevector <64 x i16> %concat0123, <64 x i16> %concat4uuu, <80 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>
  %interleaved.vec = shufflevector <80 x i16> %concat01234, <80 x i16> poison, <80 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 1, i32 17, i32 33, i32 49, i32 65, i32 2, i32 18, i32 34, i32 50, i32 66, i32 3, i32 19, i32 35, i32 51, i32 67, i32 4, i32 20, i32 36, i32 52, i32 68, i32 5, i32 21, i32 37, i32 53, i32 69, i32 6, i32 22, i32 38, i32 54, i32 70, i32 7, i32 23, i32 39, i32 55, i32 71, i32 8, i32 24, i32 40, i32 56, i32 72, i32 9, i32 25, i32 41, i32 57, i32 73, i32 10, i32 26, i32 42, i32 58, i32 74, i32 11, i32 27, i32 43, i32 59, i32 75, i32 12, i32 28, i32 44, i32 60, i32 76, i32 13, i32 29, i32 45, i32 61, i32 77, i32 14, i32 30, i32 46, i32 62, i32 78, i32 15, i32 31, i32 47, i32 63, i32 79>

  store <80 x i16> %interleaved.vec, <80 x i16>* %out.vec, align 32

  ret void
}
