; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vp2intersect,+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=X64

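; 256-bit dword intersection with register operands; both <8 x i1> results of
; llvm.x86.avx512.vp2intersect.d.256 are stored to memory as i8 masks.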
define void @test_mm256_2intersect_epi32(<4 x i64> %a, <4 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectd %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0xc1]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X64-NEXT:    movb %cl, (%rdi) # encoding: [0x88,0x0f]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <4 x i64> %a to <8 x i32>
  %1 = bitcast <4 x i64> %b to <8 x i32>
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %0, <8 x i32> %1)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  %4 = bitcast i8* %m0 to <8 x i1>*
  store <8 x i1> %3, <8 x i1>* %4, align 8
  %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  %6 = bitcast i8* %m1 to <8 x i1>*
  store <8 x i1> %5, <8 x i1>* %6, align 8
  ret void
}

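; 256-bit qword intersection with register operands; the <4 x i1> results are
; widened to <8 x i1> (kshiftl/kshiftr pair in the output) before being stored.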
define void @test_mm256_2intersect_epi64(<4 x i64> %a, <4 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %a, <4 x i64> %b)
  %1 = extractvalue { <4 x i1>, <4 x i1> } %0, 0
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  store i8 %3, i8* %m0, align 1
  %4 = extractvalue { <4 x i1>, <4 x i1> } %0, 1
  %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %6 = bitcast <8 x i1> %5 to i8
  store i8 %6, i8* %m1, align 1
  ret void
}

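; 256-bit dword intersection with operands loaded from memory; the second
; source folds into vp2intersectd as a memory operand.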
define void @test_mm256_2intersect_epi32_p(<4 x i64>* nocapture readonly %a, <4 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vmovaps (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x02]
; X86-NEXT:    vp2intersectd (%ecx), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; X64-NEXT:    vp2intersectd (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x28,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <4 x i64>* %a to <8 x i32>*
  %1 = load <8 x i32>, <8 x i32>* %0, align 32
  %2 = bitcast <4 x i64>* %b to <8 x i32>*
  %3 = load <8 x i32>, <8 x i32>* %2, align 32
  %4 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %1, <8 x i32> %3)
  %5 = extractvalue { <8 x i1>, <8 x i1> } %4, 0
  %6 = bitcast i8* %m0 to <8 x i1>*
  store <8 x i1> %5, <8 x i1>* %6, align 8
  %7 = extractvalue { <8 x i1>, <8 x i1> } %4, 1
  %8 = bitcast i8* %m1 to <8 x i1>*
  store <8 x i1> %7, <8 x i1>* %8, align 8
  ret void
}

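; 256-bit qword intersection with operands loaded from memory; the second
; source folds into vp2intersectq as a memory operand.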
define void @test_mm256_2intersect_epi64_p(<4 x i64>* nocapture readonly %a, <4 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovaps (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x06]
; X86-NEXT:    vp2intersectq (%edx), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovaps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; X64-NEXT:    vp2intersectq (%rsi), %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load <4 x i64>, <4 x i64>* %a, align 32
  %1 = load <4 x i64>, <4 x i64>* %b, align 32
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %0, <4 x i64> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

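; 256-bit dword intersection of splatted scalars; the second source folds as a
; {1to8} embedded broadcast.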
define void @test_mm256_2intersect_epi32_b(i32* nocapture readonly %a, i32* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi32_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-NEXT:    vbroadcastss (%edx), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x02]
; X86-NEXT:    vp2intersectd (%ecx){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x01]
; X86-NEXT:    kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
; X86-NEXT:    kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0]
; X86-NEXT:    movb %dl, (%eax) # encoding: [0x88,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x10]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi32_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x07]
; X64-NEXT:    vp2intersectd (%rsi){1to8}, %ymm0, %k0 # encoding: [0x62,0xf2,0x7f,0x38,0x68,0x06]
; X64-NEXT:    kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
; X64-NEXT:    kmovw %k0, %esi # encoding: [0xc5,0xf8,0x93,0xf0]
; X64-NEXT:    movb %sil, (%rdx) # encoding: [0x40,0x88,0x32]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i32, i32* %a, align 4
  %vecinit.i.i = insertelement <8 x i32> undef, i32 %0, i32 0
  %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
  %1 = load i32, i32* %b, align 4
  %vecinit.i.i2 = insertelement <8 x i32> undef, i32 %1, i32 0
  %vecinit7.i.i3 = shufflevector <8 x i32> %vecinit.i.i2, <8 x i32> undef, <8 x i32> zeroinitializer
  %2 = tail call { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32> %vecinit7.i.i, <8 x i32> %vecinit7.i.i3)
  %3 = extractvalue { <8 x i1>, <8 x i1> } %2, 0
  %4 = bitcast i8* %m0 to <8 x i1>*
  store <8 x i1> %3, <8 x i1>* %4, align 8
  %5 = extractvalue { <8 x i1>, <8 x i1> } %2, 1
  %6 = bitcast i8* %m1 to <8 x i1>*
  store <8 x i1> %5, <8 x i1>* %6, align 8
  ret void
}

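; 256-bit qword intersection of splatted scalars; the second source folds as a
; {1to4} embedded broadcast.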
define void @test_mm256_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm256_2intersect_epi64_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vbroadcastsd (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x06]
; X86-NEXT:    vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm256_2intersect_epi64_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x07]
; X64-NEXT:    vp2intersectq (%rsi){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i64, i64* %a, align 8
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %0, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %1 = load i64, i64* %b, align 8
  %vecinit.i.i2 = insertelement <4 x i64> undef, i64 %1, i32 0
  %vecinit3.i.i3 = shufflevector <4 x i64> %vecinit.i.i2, <4 x i64> undef, <4 x i32> zeroinitializer
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64> %vecinit3.i.i, <4 x i64> %vecinit3.i.i3)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

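; 128-bit dword intersection with register operands; the <4 x i1> results are
; widened to <8 x i1> before being stored.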
define void @test_mm_2intersect_epi32(<2 x i64> %a, <2 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0xc1]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x i64> %a to <4 x i32>
  %1 = bitcast <2 x i64> %b to <4 x i32>
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %0, <4 x i32> %1)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

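; 128-bit qword intersection with register operands; the <2 x i1> results are
; widened to <8 x i1> before being stored.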
define void @test_mm_2intersect_epi64(<2 x i64> %a, <2 x i64> %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X86-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1]
; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdi) # encoding: [0x88,0x07]
; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rsi) # encoding: [0x88,0x06]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %a, <2 x i64> %b)
  %1 = extractvalue { <2 x i1>, <2 x i1> } %0, 0
  %2 = shufflevector <2 x i1> %1, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %3 = bitcast <8 x i1> %2 to i8
  store i8 %3, i8* %m0, align 1
  %4 = extractvalue { <2 x i1>, <2 x i1> } %0, 1
  %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %6 = bitcast <8 x i1> %5 to i8
  store i8 %6, i8* %m1, align 1
  ret void
}

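; 128-bit dword intersection with operands loaded from memory; the second
; source folds into vp2intersectd as a memory operand.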
define void @test_mm_2intersect_epi32_p(<2 x i64>* nocapture readonly %a, <2 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
; X86-NEXT:    vp2intersectd (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-NEXT:    vp2intersectd (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x08,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = bitcast <2 x i64>* %a to <4 x i32>*
  %1 = load <4 x i32>, <4 x i32>* %0, align 16
  %2 = bitcast <2 x i64>* %b to <4 x i32>*
  %3 = load <4 x i32>, <4 x i32>* %2, align 16
  %4 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %1, <4 x i32> %3)
  %5 = extractvalue { <4 x i1>, <4 x i1> } %4, 0
  %6 = shufflevector <4 x i1> %5, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %7 = bitcast <8 x i1> %6 to i8
  store i8 %7, i8* %m0, align 1
  %8 = extractvalue { <4 x i1>, <4 x i1> } %4, 1
  %9 = shufflevector <4 x i1> %8, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = bitcast <8 x i1> %9 to i8
  store i8 %10, i8* %m1, align 1
  ret void
}

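; 128-bit qword intersection with operands loaded from memory; the second
; source folds into vp2intersectq as a memory operand.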
define void @test_mm_2intersect_epi64_p(<2 x i64>* nocapture readonly %a, <2 x i64>* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_p:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
; X86-NEXT:    vp2intersectq (%edx), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x02]
; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_p:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-NEXT:    vp2intersectq (%rsi), %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0x06]
; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load <2 x i64>, <2 x i64>* %a, align 16
  %1 = load <2 x i64>, <2 x i64>* %b, align 16
  %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %0, <2 x i64> %1)
  %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
  %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
  %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

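; 128-bit dword intersection of splatted scalars; the second source folds as a
; {1to4} embedded broadcast.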
define void @test_mm_2intersect_epi32_b(i32* nocapture readonly %a, i32* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi32_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vbroadcastss (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x06]
; X86-NEXT:    vp2intersectd (%edx){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x02]
; X86-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X86-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X86-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi32_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
; X64-NEXT:    vp2intersectd (%rsi){1to4}, %xmm0, %k0 # encoding: [0x62,0xf2,0x7f,0x18,0x68,0x06]
; X64-NEXT:    kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c]
; X64-NEXT:    kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $12, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0c]
; X64-NEXT:    kshiftrw $12, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i32, i32* %a, align 4
  %vecinit.i.i = insertelement <4 x i32> undef, i32 %0, i32 0
  %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = load i32, i32* %b, align 4
  %vecinit.i.i2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %vecinit3.i.i3 = shufflevector <4 x i32> %vecinit.i.i2, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = tail call { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32> %vecinit3.i.i, <4 x i32> %vecinit3.i.i3)
  %3 = extractvalue { <4 x i1>, <4 x i1> } %2, 0
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <4 x i1>, <4 x i1> } %2, 1
  %7 = shufflevector <4 x i1> %6, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

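; 128-bit qword intersection of splatted scalars; the second source folds as a
; {1to2} embedded broadcast.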
define void @test_mm_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocapture readonly %b, i8* nocapture %m0, i8* nocapture %m1) {
; X86-LABEL: test_mm_2intersect_epi64_b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi # encoding: [0x56]
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %esi, -8
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-NEXT:    vmovddup (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x06]
; X86-NEXT:    # xmm0 = mem[0,0]
; X86-NEXT:    vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02]
; X86-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X86-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X86-NEXT:    kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2]
; X86-NEXT:    movb %dl, (%ecx) # encoding: [0x88,0x11]
; X86-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X86-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X86-NEXT:    kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
; X86-NEXT:    movb %cl, (%eax) # encoding: [0x88,0x08]
; X86-NEXT:    popl %esi # encoding: [0x5e]
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_2intersect_epi64_b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovddup (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x07]
; X64-NEXT:    # xmm0 = mem[0,0]
; X64-NEXT:    vp2intersectq (%rsi){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x06]
; X64-NEXT:    kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e]
; X64-NEXT:    kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e]
; X64-NEXT:    kmovw %k2, %eax # encoding: [0xc5,0xf8,0x93,0xc2]
; X64-NEXT:    movb %al, (%rdx) # encoding: [0x88,0x02]
; X64-NEXT:    kshiftlw $14, %k1, %k0 # encoding: [0xc4,0xe3,0xf9,0x32,0xc1,0x0e]
; X64-NEXT:    kshiftrw $14, %k0, %k0 # encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0e]
; X64-NEXT:    kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
; X64-NEXT:    movb %al, (%rcx) # encoding: [0x88,0x01]
; X64-NEXT:    retq # encoding: [0xc3]
entry:
  %0 = load i64, i64* %a, align 8
  %vecinit.i.i = insertelement <2 x i64> undef, i64 %0, i32 0
  %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %1 = load i64, i64* %b, align 8
  %vecinit.i.i2 = insertelement <2 x i64> undef, i64 %1, i32 0
  %vecinit1.i.i3 = shufflevector <2 x i64> %vecinit.i.i2, <2 x i64> undef, <2 x i32> zeroinitializer
  %2 = tail call { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64> %vecinit1.i.i, <2 x i64> %vecinit1.i.i3)
  %3 = extractvalue { <2 x i1>, <2 x i1> } %2, 0
  %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %5 = bitcast <8 x i1> %4 to i8
  store i8 %5, i8* %m0, align 1
  %6 = extractvalue { <2 x i1>, <2 x i1> } %2, 1
  %7 = shufflevector <2 x i1> %6, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %8 = bitcast <8 x i1> %7 to i8
  store i8 %8, i8* %m1, align 1
  ret void
}

declare { <8 x i1>, <8 x i1> } @llvm.x86.avx512.vp2intersect.d.256(<8 x i32>, <8 x i32>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.q.256(<4 x i64>, <4 x i64>)
declare { <4 x i1>, <4 x i1> } @llvm.x86.avx512.vp2intersect.d.128(<4 x i32>, <4 x i32>)
declare { <2 x i1>, <2 x i1> } @llvm.x86.avx512.vp2intersect.q.128(<2 x i64>, <2 x i64>)