; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop  | FileCheck %s --check-prefix=XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL

;
; 128-bit vectors
;
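; Each function below builds the bitwise select (x & m) | (y & ~m) in IR and
; checks that it is matched to a single VPCMOV on XOP targets and a single
; VPTERNLOGQ on AVX512VL (plain AVX512F only once the operands are 512 bits
; wide). As a reading aid for the immediates below, not something this file
; checks: VPTERNLOG's imm8 is the truth table indexed by (dst<<2|src1<<1|src2),
; so 0xCA (202) = dst ? src1 : src2, 0xD8 (216) = src2 ? src1 : dst,
; 0xE2 (226) = src1 ? dst : src2, and 0xE4 (228) = src2 ? dst : src1.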

define <2 x i64> @bitselect_v2i64_rr(<2 x i64>, <2 x i64>) {
; SSE-LABEL: bitselect_v2i64_rr:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_rr:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_rr:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_rr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_rr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = and <2 x i64> %0, <i64 4294967296, i64 12884901890>
  %4 = and <2 x i64> %1, <i64 -4294967297, i64 -12884901891>
  %5 = or <2 x i64> %4, %3
  ret <2 x i64> %5
}

define <2 x i64> @bitselect_v2i64_rm(<2 x i64>, <2 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v2i64_rm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm1
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_rm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_rm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm1
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_rm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm1
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_rm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = load <2 x i64>, <2 x i64>* %1
  %4 = and <2 x i64> %0, <i64 8589934593, i64 3>
  %5 = and <2 x i64> %3, <i64 -8589934594, i64 -4>
  %6 = or <2 x i64> %5, %4
  ret <2 x i64> %6
}

define <2 x i64> @bitselect_v2i64_mr(<2 x i64>* nocapture readonly, <2 x i64>) {
; SSE-LABEL: bitselect_v2i64_mr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm1
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_mr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %xmm1
; XOP-NEXT:    vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_mr:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm1
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_mr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm1
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_mr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512VL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = load <2 x i64>, <2 x i64>* %0
  %4 = and <2 x i64> %3, <i64 12884901890, i64 4294967296>
  %5 = and <2 x i64> %1, <i64 -12884901891, i64 -4294967297>
  %6 = or <2 x i64> %4, %5
  ret <2 x i64> %6
}

define <2 x i64> @bitselect_v2i64_mm(<2 x i64>* nocapture readonly, <2 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v2i64_mm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm1
; SSE-NEXT:    movaps (%rsi), %xmm0
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_mm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %xmm0
; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022]
; XOP-NEXT:    vpcmov %xmm1, (%rdi), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_mm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps (%rsi), %xmm1
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_mm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; AVX512F-NEXT:    vmovaps (%rsi), %xmm1
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vorps %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_mm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rsi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm0 = [18446744073709551612,18446744065119617022]
; AVX512VL-NEXT:    vpternlogq $202, (%rdi), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %3 = load <2 x i64>, <2 x i64>* %0
  %4 = load <2 x i64>, <2 x i64>* %1
  %5 = and <2 x i64> %3, <i64 3, i64 8589934593>
  %6 = and <2 x i64> %4, <i64 -4, i64 -8589934594>
  %7 = or <2 x i64> %6, %5
  ret <2 x i64> %7
}

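; The broadcast_rrr/broadcast_rrm variants splat the mask from a scalar; the
; expectation is that AVX512VL folds the GPR or memory broadcast directly into
; vpternlogq, while the older targets must materialize the splat first and
; fall back to the pand/pandn/por (or vpcmov) sequence.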
define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v2i64_broadcast_rrr:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm2
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm2
; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %rdi, %xmm2
; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq %rdi, %xmm2
; AVX512VL-NEXT:    vpternlogq $226, %xmm1, %xmm2, %xmm0
; AVX512VL-NEXT:    retq
  %1 = insertelement <2 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  %3 = xor <2 x i64> %1, <i64 -1, i64 undef>
  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  %5 = and <2 x i64> %a0, %2
  %6 = and <2 x i64> %a1, %4
  %7 = or <2 x i64> %5, %6
  ret <2 x i64> %7
}

define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v2i64_broadcast_rrm:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v2i64_broadcast_rrm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; XOP-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX512F-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vandnps %xmm1, %xmm2, %xmm1
; AVX512F-NEXT:    vorps %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $228, (%rdi){1to2}, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <2 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  %3 = xor <2 x i64> %1, <i64 -1, i64 undef>
  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  %5 = and <2 x i64> %a0, %2
  %6 = and <2 x i64> %a1, %4
  %7 = or <2 x i64> %5, %6
  ret <2 x i64> %7
}

;
; 256-bit vectors
;

define <4 x i64> @bitselect_v4i64_rr(<4 x i64>, <4 x i64>) {
; SSE-LABEL: bitselect_v4i64_rr:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT:    orps %xmm3, %xmm1
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    orps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_rr:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_rr:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_rr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_rr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = and <4 x i64> %0, <i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890>
  %4 = and <4 x i64> %1, <i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891>
  %5 = or <4 x i64> %4, %3
  ret <4 x i64> %5
}

define <4 x i64> @bitselect_v4i64_rm(<4 x i64>, <4 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v4i64_rm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [18446744065119617022,18446744073709551612]
; SSE-NEXT:    movaps 16(%rdi), %xmm4
; SSE-NEXT:    andps %xmm2, %xmm4
; SSE-NEXT:    movaps (%rdi), %xmm5
; SSE-NEXT:    andps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andnps %xmm0, %xmm3
; SSE-NEXT:    orps %xmm5, %xmm3
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_rm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm1
; XOP-NEXT:    vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_rm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm1
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_rm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm1
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_rm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %1
  %4 = and <4 x i64> %0, <i64 8589934593, i64 3, i64 8589934593, i64 3>
  %5 = and <4 x i64> %3, <i64 -8589934594, i64 -4, i64 -8589934594, i64 -4>
  %6 = or <4 x i64> %5, %4
  ret <4 x i64> %6
}

define <4 x i64> @bitselect_v4i64_mr(<4 x i64>* nocapture readonly, <4 x i64>) {
; SSE-LABEL: bitselect_v4i64_mr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [12884901890,4294967296]
; SSE-NEXT:    movaps 16(%rdi), %xmm4
; SSE-NEXT:    andps %xmm2, %xmm4
; SSE-NEXT:    movaps (%rdi), %xmm5
; SSE-NEXT:    andps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    andnps %xmm0, %xmm3
; SSE-NEXT:    orps %xmm5, %xmm3
; SSE-NEXT:    andnps %xmm1, %xmm2
; SSE-NEXT:    orps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_mr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm1
; XOP-NEXT:    vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_mr:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm1
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_mr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm1
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_mr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %0
  %4 = and <4 x i64> %3, <i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296>
  %5 = and <4 x i64> %1, <i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297>
  %6 = or <4 x i64> %4, %5
  ret <4 x i64> %6
}

define <4 x i64> @bitselect_v4i64_mm(<4 x i64>* nocapture readonly, <4 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v4i64_mm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022]
; SSE-NEXT:    movaps 16(%rsi), %xmm2
; SSE-NEXT:    andps %xmm1, %xmm2
; SSE-NEXT:    movaps (%rsi), %xmm3
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    andnps (%rdi), %xmm0
; SSE-NEXT:    orps %xmm3, %xmm0
; SSE-NEXT:    andnps 16(%rdi), %xmm1
; SSE-NEXT:    orps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_mm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %ymm0
; XOP-NEXT:    vmovdqa {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; XOP-NEXT:    vpcmov %ymm1, (%rdi), %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_mm:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %ymm0
; AVX-NEXT:    vmovaps (%rsi), %ymm1
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_mm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; AVX512F-NEXT:    vmovaps (%rsi), %ymm1
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_mm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rsi), %ymm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX512VL-NEXT:    vpternlogq $202, (%rdi), %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %3 = load <4 x i64>, <4 x i64>* %0
  %4 = load <4 x i64>, <4 x i64>* %1
  %5 = and <4 x i64> %3, <i64 3, i64 8589934593, i64 3, i64 8589934593>
  %6 = and <4 x i64> %4, <i64 -4, i64 -8589934594, i64 -4, i64 -8589934594>
  %7 = or <4 x i64> %6, %5
  ret <4 x i64> %7
}

define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v4i64_broadcast_rrr:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm4
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    pandn %xmm3, %xmm5
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    pandn %xmm2, %xmm4
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm2
; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; XOP-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpandn %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %rdi, %xmm2
; AVX512F-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpandn %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq %rdi, %ymm2
; AVX512VL-NEXT:    vpternlogq $226, %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
  %1 = insertelement <4 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer
  %3 = xor <4 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> zeroinitializer
  %5 = and <4 x i64> %a0, %2
  %6 = and <4 x i64> %a1, %4
  %7 = or <4 x i64> %5, %6
  ret <4 x i64> %7
}

define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v4i64_broadcast_rrm:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm5
; SSE-NEXT:    pandn %xmm3, %xmm5
; SSE-NEXT:    por %xmm5, %xmm1
; SSE-NEXT:    pandn %xmm2, %xmm4
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i64_broadcast_rrm:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastsd (%rdi), %ymm2
; XOP-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm2
; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastsd (%rdi), %ymm2
; AVX512F-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX512F-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpternlogq $228, (%rdi){1to4}, %ymm1, %ymm0
; AVX512VL-NEXT:    retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <4 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer
  %3 = xor <4 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> zeroinitializer
  %5 = and <4 x i64> %a0, %2
  %6 = and <4 x i64> %a1, %4
  %7 = or <4 x i64> %5, %6
  ret <4 x i64> %7
}

;
; 512-bit vectors
;

define <8 x i64> @bitselect_v8i64_rr(<8 x i64>, <8 x i64>) {
; SSE-LABEL: bitselect_v8i64_rr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm8 = [18446744060824649725,18446744060824649725]
; SSE-NEXT:    andps %xmm8, %xmm7
; SSE-NEXT:    movaps {{.*#+}} xmm9 = [18446744069414584319,18446744060824649725]
; SSE-NEXT:    andps %xmm9, %xmm6
; SSE-NEXT:    andps %xmm8, %xmm5
; SSE-NEXT:    andps %xmm9, %xmm4
; SSE-NEXT:    movaps %xmm9, %xmm10
; SSE-NEXT:    andnps %xmm0, %xmm10
; SSE-NEXT:    orps %xmm4, %xmm10
; SSE-NEXT:    movaps %xmm8, %xmm4
; SSE-NEXT:    andnps %xmm1, %xmm4
; SSE-NEXT:    orps %xmm5, %xmm4
; SSE-NEXT:    andnps %xmm2, %xmm9
; SSE-NEXT:    orps %xmm6, %xmm9
; SSE-NEXT:    andnps %xmm3, %xmm8
; SSE-NEXT:    orps %xmm7, %xmm8
; SSE-NEXT:    movaps %xmm10, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm9, %xmm2
; SSE-NEXT:    movaps %xmm8, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_rr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725]
; XOP-NEXT:    vpcmov %ymm4, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm1, %ymm3, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_rr:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725]
; AVX-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX-NEXT:    vandnps %ymm0, %ymm4, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm4, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_rr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = and <8 x i64> %0, <i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890, i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890>
  %4 = and <8 x i64> %1, <i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891>
  %5 = or <8 x i64> %4, %3
  ret <8 x i64> %5
}

define <8 x i64> @bitselect_v8i64_rm(<8 x i64>, <8 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v8i64_rm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [18446744065119617022,18446744073709551612]
; SSE-NEXT:    movaps 48(%rdi), %xmm8
; SSE-NEXT:    andps %xmm4, %xmm8
; SSE-NEXT:    movaps 32(%rdi), %xmm9
; SSE-NEXT:    andps %xmm4, %xmm9
; SSE-NEXT:    movaps 16(%rdi), %xmm7
; SSE-NEXT:    andps %xmm4, %xmm7
; SSE-NEXT:    movaps (%rdi), %xmm6
; SSE-NEXT:    andps %xmm4, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andnps %xmm0, %xmm5
; SSE-NEXT:    orps %xmm6, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    andnps %xmm1, %xmm6
; SSE-NEXT:    orps %xmm7, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    andnps %xmm2, %xmm7
; SSE-NEXT:    orps %xmm9, %xmm7
; SSE-NEXT:    andnps %xmm3, %xmm4
; SSE-NEXT:    orps %xmm8, %xmm4
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_rm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm2
; XOP-NEXT:    vmovdqa 32(%rdi), %ymm3
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
; XOP-NEXT:    # ymm4 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm4, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm1, %ymm3, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_rm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vandps 32(%rdi), %ymm2, %ymm3
; AVX-NEXT:    vandps (%rdi), %ymm2, %ymm4
; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_rm:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = load <8 x i64>, <8 x i64>* %1
  %4 = and <8 x i64> %0, <i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3>
  %5 = and <8 x i64> %3, <i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4>
  %6 = or <8 x i64> %5, %4
  ret <8 x i64> %6
}

define <8 x i64> @bitselect_v8i64_mr(<8 x i64>* nocapture readonly, <8 x i64>) {
; SSE-LABEL: bitselect_v8i64_mr:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [12884901890,4294967296]
; SSE-NEXT:    movaps 48(%rdi), %xmm8
; SSE-NEXT:    andps %xmm4, %xmm8
; SSE-NEXT:    movaps 32(%rdi), %xmm9
; SSE-NEXT:    andps %xmm4, %xmm9
; SSE-NEXT:    movaps 16(%rdi), %xmm7
; SSE-NEXT:    andps %xmm4, %xmm7
; SSE-NEXT:    movaps (%rdi), %xmm6
; SSE-NEXT:    andps %xmm4, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    andnps %xmm0, %xmm5
; SSE-NEXT:    orps %xmm6, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm6
; SSE-NEXT:    andnps %xmm1, %xmm6
; SSE-NEXT:    orps %xmm7, %xmm6
; SSE-NEXT:    movaps %xmm4, %xmm7
; SSE-NEXT:    andnps %xmm2, %xmm7
; SSE-NEXT:    orps %xmm9, %xmm7
; SSE-NEXT:    andnps %xmm3, %xmm4
; SSE-NEXT:    orps %xmm8, %xmm4
; SSE-NEXT:    movaps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm6, %xmm1
; SSE-NEXT:    movaps %xmm7, %xmm2
; SSE-NEXT:    movaps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_mr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rdi), %ymm2
; XOP-NEXT:    vmovdqa 32(%rdi), %ymm3
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [12884901890,4294967296,12884901890,4294967296]
; XOP-NEXT:    # ymm4 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm4, %ymm0, %ymm2, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm1, %ymm3, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_mr:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296]
; AVX-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX-NEXT:    vandps 32(%rdi), %ymm2, %ymm3
; AVX-NEXT:    vandps (%rdi), %ymm2, %ymm4
; AVX-NEXT:    vandnps %ymm0, %ymm2, %ymm0
; AVX-NEXT:    vorps %ymm0, %ymm4, %ymm0
; AVX-NEXT:    vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_mr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm1
; AVX512-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = load <8 x i64>, <8 x i64>* %0
  %4 = and <8 x i64> %3, <i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296>
  %5 = and <8 x i64> %1, <i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297>
  %6 = or <8 x i64> %4, %5
  ret <8 x i64> %6
}

define <8 x i64> @bitselect_v8i64_mm(<8 x i64>* nocapture readonly, <8 x i64>* nocapture readonly) {
; SSE-LABEL: bitselect_v8i64_mm:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [18446744073709551612,18446744065119617022]
; SSE-NEXT:    movaps 48(%rsi), %xmm4
; SSE-NEXT:    andps %xmm3, %xmm4
; SSE-NEXT:    movaps 32(%rsi), %xmm5
; SSE-NEXT:    andps %xmm3, %xmm5
; SSE-NEXT:    movaps 16(%rsi), %xmm2
; SSE-NEXT:    andps %xmm3, %xmm2
; SSE-NEXT:    movaps (%rsi), %xmm1
; SSE-NEXT:    andps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    andnps (%rdi), %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    andnps 16(%rdi), %xmm1
; SSE-NEXT:    orps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    andnps 32(%rdi), %xmm2
; SSE-NEXT:    orps %xmm5, %xmm2
; SSE-NEXT:    andnps 48(%rdi), %xmm3
; SSE-NEXT:    orps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_mm:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa (%rsi), %ymm0
; XOP-NEXT:    vmovdqa 32(%rsi), %ymm1
; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; XOP-NEXT:    # ymm2 = mem[0,1,0,1]
; XOP-NEXT:    vpcmov %ymm2, (%rdi), %ymm0, %ymm0
; XOP-NEXT:    vpcmov %ymm2, 32(%rdi), %ymm1, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_mm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX-NEXT:    vandps 32(%rsi), %ymm1, %ymm2
; AVX-NEXT:    vandps (%rsi), %ymm1, %ymm0
; AVX-NEXT:    vandnps (%rdi), %ymm1, %ymm3
; AVX-NEXT:    vorps %ymm3, %ymm0, %ymm0
; AVX-NEXT:    vandnps 32(%rdi), %ymm1, %ymm1
; AVX-NEXT:    vorps %ymm1, %ymm2, %ymm1
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_mm:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rsi), %zmm1
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX512-NEXT:    vpternlogq $202, (%rdi), %zmm1, %zmm0
; AVX512-NEXT:    retq
  %3 = load <8 x i64>, <8 x i64>* %0
  %4 = load <8 x i64>, <8 x i64>* %1
  %5 = and <8 x i64> %3, <i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593>
  %6 = and <8 x i64> %4, <i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594>
  %7 = or <8 x i64> %6, %5
  ret <8 x i64> %7
}

define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrr:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm8
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    movdqa %xmm8, %xmm9
; SSE-NEXT:    pandn %xmm7, %xmm9
; SSE-NEXT:    por %xmm9, %xmm3
; SSE-NEXT:    movdqa %xmm8, %xmm7
; SSE-NEXT:    pandn %xmm6, %xmm7
; SSE-NEXT:    por %xmm7, %xmm2
; SSE-NEXT:    movdqa %xmm8, %xmm6
; SSE-NEXT:    pandn %xmm5, %xmm6
; SSE-NEXT:    por %xmm6, %xmm1
; SSE-NEXT:    pandn %xmm4, %xmm8
; SSE-NEXT:    por %xmm8, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrr:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm4
; XOP-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT:    vpcmov %ymm4, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm3, %ymm1, %ymm1
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vandnps %ymm3, %ymm4, %ymm3
; AVX1-NEXT:    vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vandnps %ymm2, %ymm4, %ymm2
; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm4
; AVX2-NEXT:    vpbroadcastq %xmm4, %ymm4
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpandn %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %rdi, %zmm2
; AVX512-NEXT:    vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512-NEXT:    retq
  %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
  %3 = xor <8 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> zeroinitializer
  %5 = and <8 x i64> %a0, %2
  %6 = and <8 x i64> %a1, %4
  %7 = or <8 x i64> %5, %6
  ret <8 x i64> %7
}

define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrm:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm8 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    movdqa %xmm8, %xmm9
; SSE-NEXT:    pandn %xmm7, %xmm9
; SSE-NEXT:    por %xmm9, %xmm3
; SSE-NEXT:    movdqa %xmm8, %xmm7
; SSE-NEXT:    pandn %xmm6, %xmm7
; SSE-NEXT:    por %xmm7, %xmm2
; SSE-NEXT:    movdqa %xmm8, %xmm6
; SSE-NEXT:    pandn %xmm5, %xmm6
; SSE-NEXT:    por %xmm6, %xmm1
; SSE-NEXT:    pandn %xmm4, %xmm8
; SSE-NEXT:    por %xmm8, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrm:
; XOP:       # %bb.0:
; XOP-NEXT:    vbroadcastsd (%rdi), %ymm4
; XOP-NEXT:    vpcmov %ymm4, %ymm2, %ymm0, %ymm0
; XOP-NEXT:    vpcmov %ymm4, %ymm3, %ymm1, %ymm1
; XOP-NEXT:    retq
;
; AVX-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm4
; AVX-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX-NEXT:    vandnps %ymm3, %ymm4, %ymm3
; AVX-NEXT:    vorps %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vandnps %ymm2, %ymm4, %ymm2
; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogq $228, (%rdi){1to8}, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
  %3 = xor <8 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> zeroinitializer
  %5 = and <8 x i64> %a0, %2
  %6 = and <8 x i64> %a1, %4
  %7 = or <8 x i64> %5, %6
  ret <8 x i64> %7
}

; Check that mask registers don't get canonicalized.
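; On AVX512 the i1 vectors live in k-registers, so the select is expected to
; stay as mask-register logic (vptestmd/vptestnmd + korw) rather than being
; rewritten into the vector and/andn/or bitselect form.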
define <4 x i1> @bitselect_v4i1_loop(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: bitselect_v4i1_loop:
; SSE:       # %bb.0: # %bb
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    pcmpeqd %xmm0, %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [12,12,12,12]
; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pandn %xmm0, %xmm2
; SSE-NEXT:    por %xmm1, %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    retq
;
; XOP-LABEL: bitselect_v4i1_loop:
; XOP:       # %bb.0: # %bb
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpcomneqd %xmm2, %xmm0, %xmm0
; XOP-NEXT:    vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT:    vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: bitselect_v4i1_loop:
; AVX1:       # %bb.0: # %bb
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: bitselect_v4i1_loop:
; AVX2:       # %bb.0: # %bb
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [12,12,12,12]
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [15,15,15,15]
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: bitselect_v4i1_loop:
; AVX512F:       # %bb.0: # %bb
; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
; AVX512F-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k2
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0 {%k2}
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-NEXT:    korw %k0, %k1, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: bitselect_v4i1_loop:
; AVX512VL:       # %bb.0: # %bb
; AVX512VL-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k2
; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k0 {%k2}
; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1 {%k1}
; AVX512VL-NEXT:    korw %k0, %k1, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
bb:
  %tmp = icmp ne <4 x i32> %a0, zeroinitializer
  %tmp2 = icmp eq <4 x i32> %a1, <i32 12, i32 12, i32 12, i32 12>
  %tmp3 = icmp eq <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
  %tmp4 = select <4 x i1> %tmp, <4 x i1> %tmp2, <4 x i1> %tmp3
  ret <4 x i1> %tmp4
}