1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,X86-AVX512F
3; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86,X86-AVX512BW
4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,X64-AVX512F
5; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64,X64-AVX512BW
6
7declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
8
9declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
10declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
11declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
12
13declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
14declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
15
16declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
17declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
18
19declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
20declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
21
22declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
23declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
24
; Two permvar ops whose index vectors each reverse the 8 lanes (the second's
; indices reduce mod 8 to another reversal) compose to the identity, so the
; expected codegen is a bare return.
define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_permvar_8f64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  ret <8 x double> %2
}
; Masked variant of the identity combine above: the mask-selects between the
; two permutes block the fold, so each target still emits two masked vpermpd
; ops (32-bit targets load the i8 mask from the stack; note kmovw on AVX512F
; vs kmovd on AVX512BW).
define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_permvar_8f64_identity_mask:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    kmovw %eax, %k1
; X86-AVX512F-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X86-AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512F-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X86-AVX512F-NEXT:    vmovapd %zmm1, %zmm0
; X86-AVX512F-NEXT:    retl
;
; X86-AVX512BW-LABEL: combine_permvar_8f64_identity_mask:
; X86-AVX512BW:       # %bb.0:
; X86-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT:    kmovd %eax, %k1
; X86-AVX512BW-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X86-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512BW-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X86-AVX512BW-NEXT:    vmovapd %zmm1, %zmm0
; X86-AVX512BW-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_permvar_8f64_identity_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X64-AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512F-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X64-AVX512F-NEXT:    vmovapd %zmm1, %zmm0
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_permvar_8f64_identity_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X64-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512BW-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X64-AVX512BW-NEXT:    vmovapd %zmm1, %zmm0
; X64-AVX512BW-NEXT:    retq
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1
  %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  %5 = bitcast i8 %m to <8 x i1>
  %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> %3
  ret <8 x double> %6
}
83
; Integer (v8i64) version of the identity combine: two lane reversals cancel
; out and the whole sequence folds to a bare return.
define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_permvar_8i64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  ret <8 x i64> %2
}
; Masked integer variant: the intervening mask-selects block the identity
; fold, so two masked vpermq ops remain (vmovdqa64 instead of vmovapd for the
; integer domain).
define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_permvar_8i64_identity_mask:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    kmovw %eax, %k1
; X86-AVX512F-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X86-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512F-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X86-AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-AVX512F-NEXT:    retl
;
; X86-AVX512BW-LABEL: combine_permvar_8i64_identity_mask:
; X86-AVX512BW:       # %bb.0:
; X86-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT:    kmovd %eax, %k1
; X86-AVX512BW-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X86-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512BW-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X86-AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-AVX512BW-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_permvar_8i64_identity_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512F-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X64-AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_permvar_8i64_identity_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512BW-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X64-AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-AVX512BW-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
  %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  %5 = bitcast i8 %m to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %3
  ret <8 x i64> %6
}
142
; Two all-ones-mask vpermt2var ops that each reverse the lanes compose to the
; identity and fold to a bare return.
define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermt2var_8f64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}
; Masked (zeroing) vpermt2var identity: the nonzero mask blocks the fold and
; both sides get lowered to two zero-masked vpermi2pd ops.
define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    kmovw %eax, %k1
; X86-AVX512F-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512F-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512F-NEXT:    retl
;
; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
; X86-AVX512BW:       # %bb.0:
; X86-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT:    kmovd %eax, %k1
; X86-AVX512BW-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512BW-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512BW-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512F-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512BW-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512BW-NEXT:    vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
  ret <8 x double> %res1
}
193
; Index pattern 0,0,2,2,4,4 (top two lanes undef) duplicates each even lane,
; so the variable permute folds to a single vmovddup.
define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermt2var_8f64_movddup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  ret <8 x double> %res0
}
; Same movddup combine, but with the shuffled operand loaded from memory:
; the vmovddup folds the load (mem operand); 32-bit targets first pull the
; pointer off the stack into %eax.
define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
; X86-LABEL: combine_vpermt2var_8f64_movddup_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %x0 = load <8 x double>, <8 x double> *%p0
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  ret <8 x double> %res0
}
; Masked movddup combine: the zero-masked variable permute still folds to a
; single vmovddup carrying the {%k1} {z} masking.
define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    kmovw %eax, %k1
; X86-AVX512F-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-AVX512F-NEXT:    retl
;
; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X86-AVX512BW:       # %bb.0:
; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT:    kmovd %eax, %k1
; X86-AVX512BW-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-AVX512BW-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-AVX512BW-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
  ret <8 x double> %res0
}
246
; Integer vpermt2var identity combine with an undef leading index in each
; permute; the pair still folds to a bare return.
define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_vpermt2var_8i64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}
; Masked integer vpermt2var identity: the mask blocks the fold, leaving two
; zero-masked vpermi2q ops.
define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    kmovw %eax, %k1
; X86-AVX512F-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512F-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512F-NEXT:    retl
;
; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
; X86-AVX512BW:       # %bb.0:
; X86-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT:    kmovd %eax, %k1
; X86-AVX512BW-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512BW-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512BW-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512F-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512BW-NEXT:    vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT:    retq
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
  ret <8 x i64> %res1
}
297
; 16 x f32 vpermt2var identity: two 16-lane reversals cancel and fold to a
; bare return.
define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}
; Masked 16 x f32 identity: fold is blocked, two zero-masked vpermi2ps ops
; remain. The i16 mask fits natively in kmovw, so both 32-bit configs share
; one check prefix here.
define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X86-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-AVX512F-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-AVX512BW-NEXT:    vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
  ret <16 x float> %res1
}
337
; The 32-bit index pairs 0,1 / 0,1 / 4,5 / 4,5 ... duplicate each even
; 64-bit element, so the permute folds to a single vmovddup.
define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
; Same vmovddup combine with the source loaded from memory; the load folds
; into the vmovddup mem operand.
define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
; Masked variant of the 16 x f32 vmovddup pattern: per the expected output,
; the masked form lowers to a zero-masked vpermilps rather than vmovddup —
; presumably because the i16 mask is per 32-bit element (verify if changed).
define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X86-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512BW-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
; Masked + memory-source variant: lowers to a zero-masked vpermilps with a
; folded load operand.
define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X86-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %esi, %k1
; X64-AVX512F-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %esi, %k1
; X64-AVX512BW-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512BW-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
405
; Index pattern 1,1,3,3,... duplicates every odd lane, folding to vmovshdup.
define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
; vmovshdup combine with a memory source; the load folds into the shuffle.
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
; Masked vmovshdup combine: the fold survives masking, producing a single
; zero-masked vmovshdup.
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-AVX512BW-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
450
; Index pattern 0,0,2,2,... duplicates every even lane, folding to vmovsldup.
define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
; vmovsldup combine with a memory source; the load folds into the shuffle.
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
; Masked vmovsldup combine with undef elements in the index vector (lanes 0
; and 2): undef indices are treated as matching the vmovsldup pattern and the
; masked fold still happens.
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-AVX512BW-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
; Masked vmovsldup combine with undef indices and a memory source; the load
; folds into the zero-masked vmovsldup.
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %esi, %k1
; X64-AVX512F-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %esi, %k1
; X64-AVX512BW-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-AVX512BW-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
518
; The index pattern reverses each 128-bit lane in place (3,2,1,0 within each
; lane), so the variable permute folds to an immediate-controlled vpermilps.
define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT:    ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
; vpermilps combine with a memory source; the load folds into the shuffle.
define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
; Masked vpermilps combine: the fold survives masking, producing a single
; zero-masked vpermilps.
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512BW-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %esi, %k1
; X64-AVX512F-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %esi, %k1
; X64-AVX512BW-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512BW-NEXT:    retq
  ; Combines the two cases above: the fold must keep both the zero-masking and
  ; the folded memory operand on the single vpermilps.
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
586
define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermt2var_16i32_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ; The second permute re-reverses the first (indices >=16 pick the identical
  ; second operand), so the pair folds to the identity — even with undef
  ; indices in both tables. Expect no shuffle instructions at all.
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}
define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16i32_identity_mask:
; X86:       # %bb.0:
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X86-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16i32_identity_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-AVX512F-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16i32_identity_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-AVX512BW-NEXT:    vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT:    retq
  ; Same reverse/re-reverse pattern as the unmasked test, but with a live mask
  ; the intermediate zeroed lanes feed the second permute, so collapsing to the
  ; identity would be wrong — both zero-masked vpermi2d ops must survive.
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
  ret <16 x i32> %res1
}
626
define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
; CHECK-LABEL: combine_permvar_as_vpbroadcastd512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ; An all-zero index vector replicates element 0 everywhere, so the permute
  ; combines to a broadcast of the low element.
  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer)
  ret <16 x i32> %1
}
635
define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
; CHECK-LABEL: combine_permvar_as_vpbroadcastq512:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ; i64 variant of the zero-index broadcast combine.
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer)
  ret <8 x i64> %1
}
644
define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_permvar_8i64_as_permq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  ; A fully constant index vector (undefs included) turns the variable permute
  ; into an immediate-controlled permute; the undef lanes are free to take the
  ; 3,2,1,0 / 7,6,5,4 pattern.
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  ret <8 x i64> %1
}
define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_permvar_8i64_as_permq_mask:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    kmovw %eax, %k1
; X86-AVX512F-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X86-AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-AVX512F-NEXT:    retl
;
; X86-AVX512BW-LABEL: combine_permvar_8i64_as_permq_mask:
; X86-AVX512BW:       # %bb.0:
; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT:    kmovd %eax, %k1
; X86-AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X86-AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-AVX512BW-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_permvar_8i64_as_permq_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_permvar_8i64_as_permq_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-AVX512BW-NEXT:    retq
  ; The select against %x1 expresses merge-masking: the fold must produce a
  ; merge-masked vpermq ({%k1}, no {z}) writing over a copy of %x1.
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
  ret <8 x i64> %3
}
688
define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_permvar_8f64_as_permpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  ; f64 counterpart of the constant-index permq fold: variable permute with a
  ; constant (partially undef) table becomes an immediate vpermpd.
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  ret <8 x double> %1
}
define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_permvar_8f64_as_permpd_mask:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    kmovw %eax, %k1
; X86-AVX512F-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X86-AVX512F-NEXT:    vmovapd %zmm1, %zmm0
; X86-AVX512F-NEXT:    retl
;
; X86-AVX512BW-LABEL: combine_permvar_8f64_as_permpd_mask:
; X86-AVX512BW:       # %bb.0:
; X86-AVX512BW-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT:    kmovd %eax, %k1
; X86-AVX512BW-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X86-AVX512BW-NEXT:    vmovapd %zmm1, %zmm0
; X86-AVX512BW-NEXT:    retl
;
; X64-AVX512F-LABEL: combine_permvar_8f64_as_permpd_mask:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    kmovw %edi, %k1
; X64-AVX512F-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-AVX512F-NEXT:    vmovapd %zmm1, %zmm0
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: combine_permvar_8f64_as_permpd_mask:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    kmovd %edi, %k1
; X64-AVX512BW-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-AVX512BW-NEXT:    vmovapd %zmm1, %zmm0
; X64-AVX512BW-NEXT:    retq
  ; Merge-masked f64 variant: the select with %x1 must become a merge-masked
  ; vpermpd rather than being dropped or zero-masked.
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1
  ret <8 x double> %3
}
732
define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
; CHECK-NEXT:    ret{{[l|q]}}
  ; Two chained in-lane vpermilvar shuffles compose into the single combined
  ; in-lane mask named in the function (2,3,0,1,4,6,7,5,...).
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1)
  ret <16 x float> %res1
}
742
define <8 x double> @combine_vpermi2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermi2var_8f64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ; Reverse followed by its inverse (both operands of the second permute are
  ; %res0, so indices >=8 alias indices-8) folds to the identity.
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}
751
define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermi2var_8f64_as_shufpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  ; Each 128-bit lane takes one element from %x0 and one from %x1, matching
  ; the SHUFPD pattern, so the table permute folds to an immediate vshufpd.
  %1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 1, i64 8, i64 2, i64 10, i64 5, i64 13, i64 6, i64 15>, <8 x double> %x1, i8 -1)
  ret <8 x double> %1
}
760
define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_vpermi2var_8i64_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ; i64 identity-fold variant, with undef in both index tables to check the
  ; combine tolerates unknown lanes.
  %res0 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %res0, <8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}
769
define <16 x float> @combine_vpermi2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermi2var_16f32_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ; 16 x f32 reverse/re-reverse identity fold — no shuffles should be emitted.
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}
778
define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermi2var_16i32_identity:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ; 16 x i32 identity fold with undef index lanes in both tables.
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}
787
define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
; CHECK-NEXT:    ret{{[l|q]}}
  ; The index table interleaves the high half of each 128-bit lane of %a1 and
  ; %a0, which is exactly VUNPCKHPS (with the operands swapped).
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> <i32 18, i32 2, i32 19, i32 3, i32 22, i32 6, i32 23, i32 7, i32 26, i32 10, i32 27, i32 11, i32 30, i32 14, i32 31, i32 15>, <16 x float> %a1, i16 -1)
  ret <16 x float> %res0
}
796
define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT:    ret{{[l|q]}}
  ; Low-half interleave per 128-bit lane: folds to a single unpcklps
  ; (the int shuffle is emitted in the FP domain here).
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>, <16 x i32> %a1, i16 -1)
  ret <16 x i32> %res0
}
805
define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) {
; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  ; Both permutes only ever read %x0, so the composition collapses to one
  ; single-source vpermpd. The X86 constant is the same 8 x i64 table printed
  ; as 16 x i32 (low,high) pairs.
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}
822
define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) {
; X86-LABEL: combine_vpermt2var_8i64_as_vpermq:
; X86:       # %bb.0:
; X86-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  ; i64 counterpart of the single-source composition fold; emitted as vpermpd
  ; in the FP domain.
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}
839
define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ; Two vpermi2var ops that only read %x0 compose into one single-source
  ; vpermps with the merged index table.
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}
850
define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ; i32 counterpart of the single-source composition fold; emitted as vpermps
  ; in the FP domain.
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}
861
define <16 x i32> @combine_vpermt2var_16i32_as_vpsrlq(<16 x i32> %x0) {
; CHECK-LABEL: combine_vpermt2var_16i32_as_vpsrlq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $32, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ; Each i64-sized pair becomes (odd element, zero) — index 16 selects the
  ; zeroinitializer operand — which is a 32-bit logical right shift of each
  ; quadword.
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>, <16 x i32> %x0, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res0
}
870
define <16 x i32> @combine_vpermt2var_16i32_as_vpsllq(<16 x i32> %x0) {
; CHECK-LABEL: combine_vpermt2var_16i32_as_vpsllq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $32, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ; Mirror of the vpsrlq case: (zero, even element) pairs are a 32-bit left
  ; shift of each quadword.
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>, <16 x i32> %x0, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res0
}
879
define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) {
; X86-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X86:       # %bb.0:
; X86-NEXT:    vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
; X86-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
; X86-NEXT:    vmovapd %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X64:       # %bb.0:
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15]
; X64-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
  ; The result genuinely needs elements of both %x0 and %x1, so the two
  ; permutes compose into a single two-source vpermi2pd with a merged table
  ; (printed as i32 pairs on X86).
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 15, i64 0, i64 8, i64 7, i64 12, i64 6, i64 11, i64 4>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}
898
define <8 x double> @combine_vpermi2var_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1, i64 %a2) {
; X86-LABEL: combine_vpermi2var_8f64_as_permpd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
; X86-NEXT:    vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-NEXT:    vinsertf64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %zmm2, %zmm2
; X86-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2
; X86-NEXT:    vpermpd {{.*#+}} zmm0 = zmm2[2,3,1,1,6,7,5,5]
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_8f64_as_permpd:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,2,2,5,7,6,6]
; X64-NEXT:    retq
  ; Index 0 is runtime-variable (%a2), but the trailing shufflevector never
  ; reads element 0 of the permute result, so on X64 the whole chain folds to
  ; one vpermpd of %x0. NOTE(review): on X86 the i64 insertelement is built
  ; through memory/subvector inserts, and the fold does not happen there.
  %res0 = insertelement <8 x i64> <i64 0, i64 2, i64 1, i64 3, i64 4, i64 6, i64 5, i64 7>, i64 %a2, i32 0
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %res0, <8 x double> %x1, i8 -1)
  %res2 = shufflevector <8 x double> %res1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 1, i32 6, i32 7, i32 5, i32 5>
  ret <8 x double> %res2
}
919
define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; CHECK-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ; Two-source composition over %x0/%x1 that merges into a single vpermt2d
  ; with the combined index table.
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 0, i32 31, i32 2, i32 29, i32 4, i32 27, i32 6, i32 25, i32 8, i32 23, i32 10, i32 21, i32 12, i32 19, i32 14, i32 17>, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 17, i32 2, i32 18, i32 4, i32 19, i32 6, i32 21, i32 8, i32 23, i32 10, i32 25, i32 12, i32 27, i32 14, i32 29>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}
930
define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) {
; X86-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
; X86:       # %bb.0:
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0]
; X86-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
; X64-NEXT:    retq
  ; Blend-with-zero followed by a permute combines into one vpermt2pd whose
  ; second source is a zeroed register; table entries >=8 select zeros.
  %res0 = shufflevector <8 x double> %x0, <8 x double> zeroinitializer, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 3, i64 2, i64 1, i64 7, i64 0, i64 6, i64 5, i64 4>)
  ret <8 x double> %1
}
949
define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x float> %x0) {
; CHECK-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
; CHECK-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ; f32 variant of the blend-with-zero fold: one vpermt2ps against a zeroed
  ; register, with combined table entries >=16 selecting zeros.
  %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 14, i32 2, i32 12, i32 4, i32 10, i32 3, i32 12, i32 4, i32 11, i32 5, i32 10, i32 6, i32 9, i32 7, i32 8>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}
961
define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
; X86-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
; X86:       # %bb.0:
; X86-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastq %rdi, %zmm0
; X64-NEXT:    retq
  ; Inserting the scalar into lane 0 and permuting with all-zero indices is a
  ; broadcast of %a0: from a GPR on X64, from the stack slot on X86.
  %1 = insertelement <8 x i64> undef, i64 %a0, i32 0
  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer)
  ret <8 x i64> %2
}
976