Lines Matching refs:AVX2

2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2,AVX2-SLOW %s
3 …shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST,AVX2-FAST-ALL %s
4 …=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2,AVX2-FAST,AVX2-FAST-PERLAN…
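Only the AVX2* CHECK lines matched the search, so the LLVM IR bodies that the RUN lines above compile are not shown here. For orientation, a minimal sketch of the shape such a function presumably has follows: a stride-3 interleaving store of three i16 vectors, which is what the vf2/vf4/vf8/vf16/vf32 shuffle sequences below lower. The @vf2 signature, value names, and shuffle masks in the sketch are illustrative assumptions, not copied from the test file.

; Minimal sketch (assumption): interleave three <2 x i16> inputs into one <6 x i16> stride-3 store.
define void @vf2(ptr %in.a, ptr %in.b, ptr %in.c, ptr %out) {
  %a = load <2 x i16>, ptr %in.a   ; presumably the (%rdi) operand in the asm below
  %b = load <2 x i16>, ptr %in.b   ; presumably (%rsi)
  %c = load <2 x i16>, ptr %in.c   ; presumably (%rdx)
  ; Concatenate a and b, widen c, then interleave as a0 b0 c0 a1 b1 c1.
  %ab  = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %cx  = shufflevector <2 x i16> %c, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %abc = shufflevector <4 x i16> %ab, <4 x i16> %cx, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
  store <6 x i16> %abc, ptr %out   ; presumably (%rcx)
  ret void
}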
9 ; AVX2-LABEL: vf2:
10 ; AVX2: # %bb.0:
11 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
12 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
13 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem…
14 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,4,5,12,13,6,7,4,5,6,7]
15 ; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rcx)
16 ; AVX2-NEXT: vmovq %xmm0, (%rcx)
17 ; AVX2-NEXT: retq
33 ; AVX2-LABEL: vf4:
34 ; AVX2: # %bb.0:
35 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
36 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
37 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
38 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
39 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
40 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9],zero,zero,ymm0[2,3,10,11],zero,zero,ymm0[4,5,…
41 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
42 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,ymm0[2,3]…
43 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
44 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
45 ; AVX2-NEXT: vmovq %xmm1, 16(%rcx)
46 ; AVX2-NEXT: vmovdqa %xmm0, (%rcx)
47 ; AVX2-NEXT: vzeroupper
48 ; AVX2-NEXT: retq
64 ; AVX2-SLOW-LABEL: vf8:
65 ; AVX2-SLOW: # %bb.0:
66 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
67 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm1
68 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm2
69 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
70 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u,22,23,1…
71 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
72 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u…
73 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],y…
74 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
75 ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
76 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,2…
77 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
78 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm…
79 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
80 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
81 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
82 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, 32(%rcx)
83 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rcx)
84 ; AVX2-SLOW-NEXT: vzeroupper
85 ; AVX2-SLOW-NEXT: retq
87 ; AVX2-FAST-ALL-LABEL: vf8:
88 ; AVX2-FAST-ALL: # %bb.0:
89 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm0
90 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm1
91 ; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm2
92 ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
93 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
94 ; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4
95 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6]
96 ; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3
97 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5,u,u,2,3,6,7,u,u,8,9,12,13,u,u,18,19,2…
98 ; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,2…
99 ; AVX2-FAST-ALL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
100 ; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6]…
101 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
102 ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
103 ; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[…
104 ; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, 32(%rcx)
105 ; AVX2-FAST-ALL-NEXT: vmovdqa %ymm3, (%rcx)
106 ; AVX2-FAST-ALL-NEXT: vzeroupper
107 ; AVX2-FAST-ALL-NEXT: retq
109 ; AVX2-FAST-PERLANE-LABEL: vf8:
110 ; AVX2-FAST-PERLANE: # %bb.0:
111 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
112 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm1
113 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm2
114 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
115 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,u,u,6,7,2,3,u,u,8,9,4,5,u,u,16,17,u,u…
116 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
117 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23…
118 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],y…
119 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
120 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4
121 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,2…
122 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
123 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm…
124 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
125 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
126 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],x…
127 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 32(%rcx)
128 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rcx)
129 ; AVX2-FAST-PERLANE-NEXT: vzeroupper
130 ; AVX2-FAST-PERLANE-NEXT: retq
146 ; AVX2-SLOW-LABEL: vf16:
147 ; AVX2-SLOW: # %bb.0:
148 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm0
149 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1
150 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm2
151 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm3
152 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
153 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm5
154 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[3,3,3,3,4,5,6,7]
155 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
156 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2],xmm4[3,4],xmm6[5],xmm4[6,7]
157 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm…
158 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
159 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
160 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
161 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm4, %ymm4
162 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,2…
163 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
164 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2]
165 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm6
166 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
167 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3],xmm4[4],xmm7[5,6],xmm4[7]
168 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm…
169 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,10,11,6,7,8,9,14,15,12,13,12,13]
170 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
171 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23…
172 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
173 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
174 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
175 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm4, %ymm1
176 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,…
177 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,2…
178 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
179 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,3,3,u,4,4,u,5>
180 ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm1, %ymm1
181 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0
182 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
183 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 64(%rcx)
184 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rcx)
185 ; AVX2-SLOW-NEXT: vzeroupper
186 ; AVX2-SLOW-NEXT: retq
188 ; AVX2-FAST-LABEL: vf16:
189 ; AVX2-FAST: # %bb.0:
190 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0
191 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1
192 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm2
193 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u]
194 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4
195 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5
196 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[1,1,2,2]
197 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7]
198 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm…
199 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
200 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
201 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,0,0,u,1,1,u,2>
202 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3
203 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,2…
204 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
205 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,2,2]
206 ; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm6
207 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
208 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3],xmm3[4],xmm7[5,6],xmm3[7]
209 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm…
210 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,4,5,10,11,6,7,8,9,14,15,12,13,12,13]
211 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
212 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23…
213 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
214 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3
215 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,u,3,3,u,4,4,u>
216 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1
217 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,…
218 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,2…
219 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0
220 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <u,3,3,u,4,4,u,5>
221 ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm1, %ymm1
222 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
223 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
224 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rcx)
225 ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rcx)
226 ; AVX2-FAST-NEXT: vzeroupper
227 ; AVX2-FAST-NEXT: retq
243 ; AVX2-SLOW-LABEL: vf32:
244 ; AVX2-SLOW: # %bb.0:
245 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm9
246 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3
247 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11
248 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm10
249 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm5
250 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm6
251 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4
252 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
253 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0
254 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm1
255 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,3,3,3,4,5,6,7]
256 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
257 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm2[2],xmm7[3,4],xmm2[5],xmm7[6,7]
258 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm…
259 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
260 ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
261 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
262 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <u,0,0,u,1,1,u,2>
263 ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm13, %ymm7
264 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,2…
265 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm7, %ymm8
266 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0
267 ; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm7
268 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm…
269 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,4,5,10,11,6,7,8,9,14,15,12,13,12,13]
270 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm2
271 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
272 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
273 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7]
274 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
275 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,…
276 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm7
277 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
278 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm7, %ymm15
279 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,2,2]
280 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[3,3,3,3,4,5,6,7]
281 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
282 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2],xmm0[3,4],xmm7[5],xmm0[6,7]
283 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm…
284 ; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
285 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
286 ; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm13, %ymm1
287 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0
288 ; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm1
289 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm…
290 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm4
291 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
292 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
293 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6],xmm6[7]
294 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
295 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm2
296 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
297 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
298 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,2…
299 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm3
300 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
301 ; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm4, %ymm6
302 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,2…
303 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3
304 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <u,3,3,u,4,4,u,5>
305 ; AVX2-SLOW-NEXT: vpermd 32(%rdi), %ymm6, %ymm11
306 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm11, %ymm3
307 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm9, %ymm2
308 ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm4, %ymm4
309 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2
310 ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm6, %ymm4
311 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
312 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rcx)
313 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, 128(%rcx)
314 ; AVX2-SLOW-NEXT: vmovdqa %ymm1, 64(%rcx)
315 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%rcx)
316 ; AVX2-SLOW-NEXT: vmovdqa %ymm15, 160(%rcx)
317 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
318 ; AVX2-SLOW-NEXT: vzeroupper
319 ; AVX2-SLOW-NEXT: retq
321 ; AVX2-FAST-LABEL: vf32:
322 ; AVX2-FAST: # %bb.0:
323 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm10
324 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm3
325 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm12
326 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm11
327 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm5
328 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm4
329 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u>
330 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm7
331 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
332 ; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm6
333 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
334 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
335 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2],xmm2[3,4],xmm7[5],xmm2[6,7]
336 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm…
337 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
338 ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0
339 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
340 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,0,0,u,1,1,u,2>
341 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm14, %ymm7
342 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,2…
343 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm7, %ymm0
344 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
345 ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm0
346 ; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm7
347 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm…
348 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,4,5,10,11,6,7,8,9,14,15,12,13,12,13]
349 ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm2, %xmm2
350 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
351 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
352 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3],xmm0[4],xmm7[5,6],xmm0[7]
353 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
354 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,…
355 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm7
356 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
357 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm7, %ymm8
358 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm0
359 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,2,2]
360 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
361 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm…
362 ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm1, %xmm1
363 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
364 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm14, %ymm1
365 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0
366 ; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm1
367 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm…
368 ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm4
369 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
370 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
371 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6],xmm6[7]
372 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
373 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2
374 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
375 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
376 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,2…
377 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3
378 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
379 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm6
380 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,2…
381 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3
382 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <u,3,3,u,4,4,u,5>
383 ; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm6, %ymm9
384 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm9, %ymm3
385 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm2
386 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm4, %ymm4
387 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2
388 ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm6, %ymm4
389 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
390 ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rcx)
391 ; AVX2-FAST-NEXT: vmovdqa %ymm3, 128(%rcx)
392 ; AVX2-FAST-NEXT: vmovdqa %ymm1, 64(%rcx)
393 ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%rcx)
394 ; AVX2-FAST-NEXT: vmovdqa %ymm8, 160(%rcx)
395 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
396 ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rcx)
397 ; AVX2-FAST-NEXT: vzeroupper
398 ; AVX2-FAST-NEXT: retq