1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=XOP 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL 8 9; 10; 128-bit vectors 11; 12 13define <2 x i64> @bitselect_v2i64_rr(<2 x i64>, <2 x i64>) { 14; SSE-LABEL: bitselect_v2i64_rr: 15; SSE: # %bb.0: 16; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 17; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 18; SSE-NEXT: orps %xmm1, %xmm0 19; SSE-NEXT: retq 20; 21; XOP-LABEL: bitselect_v2i64_rr: 22; XOP: # %bb.0: 23; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1, %xmm0 24; XOP-NEXT: retq 25; 26; AVX-LABEL: bitselect_v2i64_rr: 27; AVX: # %bb.0: 28; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 29; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 30; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 31; AVX-NEXT: retq 32; 33; AVX512F-LABEL: bitselect_v2i64_rr: 34; AVX512F: # %bb.0: 35; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 36; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 37; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 38; AVX512F-NEXT: retq 39; 40; AVX512VL-LABEL: bitselect_v2i64_rr: 41; AVX512VL: # %bb.0: 42; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 43; AVX512VL-NEXT: retq 44 %3 = and <2 x i64> %0, <i64 4294967296, i64 12884901890> 45 %4 = and <2 x i64> %1, <i64 -4294967297, i64 -12884901891> 46 %5 = or <2 x i64> 
%4, %3 47 ret <2 x i64> %5 48} 49 50define <2 x i64> @bitselect_v2i64_rm(<2 x i64>, <2 x i64>* nocapture readonly) { 51; SSE-LABEL: bitselect_v2i64_rm: 52; SSE: # %bb.0: 53; SSE-NEXT: movaps (%rdi), %xmm1 54; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 55; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 56; SSE-NEXT: orps %xmm1, %xmm0 57; SSE-NEXT: retq 58; 59; XOP-LABEL: bitselect_v2i64_rm: 60; XOP: # %bb.0: 61; XOP-NEXT: vmovdqa (%rdi), %xmm1 62; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1, %xmm0 63; XOP-NEXT: retq 64; 65; AVX-LABEL: bitselect_v2i64_rm: 66; AVX: # %bb.0: 67; AVX-NEXT: vmovaps (%rdi), %xmm1 68; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 69; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 70; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 71; AVX-NEXT: retq 72; 73; AVX512F-LABEL: bitselect_v2i64_rm: 74; AVX512F: # %bb.0: 75; AVX512F-NEXT: vmovaps (%rdi), %xmm1 76; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 77; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 78; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 79; AVX512F-NEXT: retq 80; 81; AVX512VL-LABEL: bitselect_v2i64_rm: 82; AVX512VL: # %bb.0: 83; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1 84; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 85; AVX512VL-NEXT: retq 86 %3 = load <2 x i64>, <2 x i64>* %1 87 %4 = and <2 x i64> %0, <i64 8589934593, i64 3> 88 %5 = and <2 x i64> %3, <i64 -8589934594, i64 -4> 89 %6 = or <2 x i64> %5, %4 90 ret <2 x i64> %6 91} 92 93define <2 x i64> @bitselect_v2i64_mr(<2 x i64>* nocapture readonly, <2 x i64>) { 94; SSE-LABEL: bitselect_v2i64_mr: 95; SSE: # %bb.0: 96; SSE-NEXT: movaps (%rdi), %xmm1 97; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 98; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 99; SSE-NEXT: orps %xmm1, %xmm0 100; SSE-NEXT: retq 101; 102; XOP-LABEL: bitselect_v2i64_mr: 103; XOP: # %bb.0: 104; XOP-NEXT: vmovdqa (%rdi), %xmm1 
105; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1, %xmm0 106; XOP-NEXT: retq 107; 108; AVX-LABEL: bitselect_v2i64_mr: 109; AVX: # %bb.0: 110; AVX-NEXT: vmovaps (%rdi), %xmm1 111; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 112; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 113; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 114; AVX-NEXT: retq 115; 116; AVX512F-LABEL: bitselect_v2i64_mr: 117; AVX512F: # %bb.0: 118; AVX512F-NEXT: vmovaps (%rdi), %xmm1 119; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 120; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 121; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 122; AVX512F-NEXT: retq 123; 124; AVX512VL-LABEL: bitselect_v2i64_mr: 125; AVX512VL: # %bb.0: 126; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1 127; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 128; AVX512VL-NEXT: retq 129 %3 = load <2 x i64>, <2 x i64>* %0 130 %4 = and <2 x i64> %3, <i64 12884901890, i64 4294967296> 131 %5 = and <2 x i64> %1, <i64 -12884901891, i64 -4294967297> 132 %6 = or <2 x i64> %4, %5 133 ret <2 x i64> %6 134} 135 136define <2 x i64> @bitselect_v2i64_mm(<2 x i64>* nocapture readonly, <2 x i64>* nocapture readonly) { 137; SSE-LABEL: bitselect_v2i64_mm: 138; SSE: # %bb.0: 139; SSE-NEXT: movaps (%rdi), %xmm1 140; SSE-NEXT: movaps (%rsi), %xmm0 141; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 142; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 143; SSE-NEXT: orps %xmm1, %xmm0 144; SSE-NEXT: retq 145; 146; XOP-LABEL: bitselect_v2i64_mm: 147; XOP: # %bb.0: 148; XOP-NEXT: vmovdqa (%rsi), %xmm0 149; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022] 150; XOP-NEXT: vpcmov %xmm1, (%rdi), %xmm0, %xmm0 151; XOP-NEXT: retq 152; 153; AVX-LABEL: bitselect_v2i64_mm: 154; AVX: # %bb.0: 155; AVX-NEXT: vmovaps (%rdi), %xmm0 156; AVX-NEXT: vmovaps (%rsi), %xmm1 157; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 
158; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 159; AVX-NEXT: vorps %xmm0, %xmm1, %xmm0 160; AVX-NEXT: retq 161; 162; AVX512F-LABEL: bitselect_v2i64_mm: 163; AVX512F: # %bb.0: 164; AVX512F-NEXT: vmovaps (%rdi), %xmm0 165; AVX512F-NEXT: vmovaps (%rsi), %xmm1 166; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 167; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 168; AVX512F-NEXT: vorps %xmm0, %xmm1, %xmm0 169; AVX512F-NEXT: retq 170; 171; AVX512VL-LABEL: bitselect_v2i64_mm: 172; AVX512VL: # %bb.0: 173; AVX512VL-NEXT: vmovdqa (%rsi), %xmm1 174; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551612,18446744065119617022] 175; AVX512VL-NEXT: vpternlogq $202, (%rdi), %xmm1, %xmm0 176; AVX512VL-NEXT: retq 177 %3 = load <2 x i64>, <2 x i64>* %0 178 %4 = load <2 x i64>, <2 x i64>* %1 179 %5 = and <2 x i64> %3, <i64 3, i64 8589934593> 180 %6 = and <2 x i64> %4, <i64 -4, i64 -8589934594> 181 %7 = or <2 x i64> %6, %5 182 ret <2 x i64> %7 183} 184 185define <2 x i64> @bitselect_v2i64_broadcast_rrr(<2 x i64> %a0, <2 x i64> %a1, i64 %a2) { 186; SSE-LABEL: bitselect_v2i64_broadcast_rrr: 187; SSE: # %bb.0: 188; SSE-NEXT: movq %rdi, %xmm2 189; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] 190; SSE-NEXT: pand %xmm2, %xmm0 191; SSE-NEXT: pandn %xmm1, %xmm2 192; SSE-NEXT: por %xmm2, %xmm0 193; SSE-NEXT: retq 194; 195; XOP-LABEL: bitselect_v2i64_broadcast_rrr: 196; XOP: # %bb.0: 197; XOP-NEXT: vmovq %rdi, %xmm2 198; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] 199; XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 200; XOP-NEXT: retq 201; 202; AVX1-LABEL: bitselect_v2i64_broadcast_rrr: 203; AVX1: # %bb.0: 204; AVX1-NEXT: vmovq %rdi, %xmm2 205; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] 206; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 207; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1 208; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 209; AVX1-NEXT: retq 210; 211; AVX2-LABEL: bitselect_v2i64_broadcast_rrr: 212; AVX2: # %bb.0: 213; AVX2-NEXT: 
vmovq %rdi, %xmm2 214; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 215; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 216; AVX2-NEXT: vpandn %xmm1, %xmm2, %xmm1 217; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 218; AVX2-NEXT: retq 219; 220; AVX512F-LABEL: bitselect_v2i64_broadcast_rrr: 221; AVX512F: # %bb.0: 222; AVX512F-NEXT: vmovq %rdi, %xmm2 223; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2 224; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 225; AVX512F-NEXT: vpandn %xmm1, %xmm2, %xmm1 226; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 227; AVX512F-NEXT: retq 228; 229; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrr: 230; AVX512VL: # %bb.0: 231; AVX512VL-NEXT: vpbroadcastq %rdi, %xmm2 232; AVX512VL-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0 233; AVX512VL-NEXT: retq 234 %1 = insertelement <2 x i64> undef, i64 %a2, i32 0 235 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer 236 %3 = xor <2 x i64> %1, <i64 -1, i64 undef> 237 %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer 238 %5 = and <2 x i64> %a0, %2 239 %6 = and <2 x i64> %a1, %4 240 %7 = or <2 x i64> %5, %6 241 ret <2 x i64> %7 242} 243 244define <2 x i64> @bitselect_v2i64_broadcast_rrm(<2 x i64> %a0, <2 x i64> %a1, i64* %p2) { 245; SSE-LABEL: bitselect_v2i64_broadcast_rrm: 246; SSE: # %bb.0: 247; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero 248; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] 249; SSE-NEXT: pand %xmm2, %xmm0 250; SSE-NEXT: pandn %xmm1, %xmm2 251; SSE-NEXT: por %xmm2, %xmm0 252; SSE-NEXT: retq 253; 254; XOP-LABEL: bitselect_v2i64_broadcast_rrm: 255; XOP: # %bb.0: 256; XOP-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] 257; XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 258; XOP-NEXT: retq 259; 260; AVX-LABEL: bitselect_v2i64_broadcast_rrm: 261; AVX: # %bb.0: 262; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] 263; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 264; AVX-NEXT: vandnps %xmm1, %xmm2, %xmm1 265; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 266; AVX-NEXT: retq 267; 268; AVX512F-LABEL: 
bitselect_v2i64_broadcast_rrm: 269; AVX512F: # %bb.0: 270; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] 271; AVX512F-NEXT: vandps %xmm2, %xmm0, %xmm0 272; AVX512F-NEXT: vandnps %xmm1, %xmm2, %xmm1 273; AVX512F-NEXT: vorps %xmm1, %xmm0, %xmm0 274; AVX512F-NEXT: retq 275; 276; AVX512VL-LABEL: bitselect_v2i64_broadcast_rrm: 277; AVX512VL: # %bb.0: 278; AVX512VL-NEXT: vpternlogq $228, (%rdi){1to2}, %xmm1, %xmm0 279; AVX512VL-NEXT: retq 280 %a2 = load i64, i64* %p2 281 %1 = insertelement <2 x i64> undef, i64 %a2, i32 0 282 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer 283 %3 = xor <2 x i64> %1, <i64 -1, i64 undef> 284 %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer 285 %5 = and <2 x i64> %a0, %2 286 %6 = and <2 x i64> %a1, %4 287 %7 = or <2 x i64> %5, %6 288 ret <2 x i64> %7 289} 290 291; 292; 256-bit vectors 293; 294 295define <4 x i64> @bitselect_v4i64_rr(<4 x i64>, <4 x i64>) { 296; SSE-LABEL: bitselect_v4i64_rr: 297; SSE: # %bb.0: 298; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 299; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 300; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 301; SSE-NEXT: orps %xmm3, %xmm1 302; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 303; SSE-NEXT: orps %xmm2, %xmm0 304; SSE-NEXT: retq 305; 306; XOP-LABEL: bitselect_v4i64_rr: 307; XOP: # %bb.0: 308; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1, %ymm0 309; XOP-NEXT: retq 310; 311; AVX-LABEL: bitselect_v4i64_rr: 312; AVX: # %bb.0: 313; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 314; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 315; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 316; AVX-NEXT: retq 317; 318; AVX512F-LABEL: bitselect_v4i64_rr: 319; AVX512F: # %bb.0: 320; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 321; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 322; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 323; 
AVX512F-NEXT: retq 324; 325; AVX512VL-LABEL: bitselect_v4i64_rr: 326; AVX512VL: # %bb.0: 327; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 328; AVX512VL-NEXT: retq 329 %3 = and <4 x i64> %0, <i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890> 330 %4 = and <4 x i64> %1, <i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891> 331 %5 = or <4 x i64> %4, %3 332 ret <4 x i64> %5 333} 334 335define <4 x i64> @bitselect_v4i64_rm(<4 x i64>, <4 x i64>* nocapture readonly) { 336; SSE-LABEL: bitselect_v4i64_rm: 337; SSE: # %bb.0: 338; SSE-NEXT: movaps {{.*#+}} xmm2 = [18446744065119617022,18446744073709551612] 339; SSE-NEXT: movaps 16(%rdi), %xmm4 340; SSE-NEXT: andps %xmm2, %xmm4 341; SSE-NEXT: movaps (%rdi), %xmm5 342; SSE-NEXT: andps %xmm2, %xmm5 343; SSE-NEXT: movaps %xmm2, %xmm3 344; SSE-NEXT: andnps %xmm0, %xmm3 345; SSE-NEXT: orps %xmm5, %xmm3 346; SSE-NEXT: andnps %xmm1, %xmm2 347; SSE-NEXT: orps %xmm4, %xmm2 348; SSE-NEXT: movaps %xmm3, %xmm0 349; SSE-NEXT: movaps %xmm2, %xmm1 350; SSE-NEXT: retq 351; 352; XOP-LABEL: bitselect_v4i64_rm: 353; XOP: # %bb.0: 354; XOP-NEXT: vmovdqa (%rdi), %ymm1 355; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1, %ymm0 356; XOP-NEXT: retq 357; 358; AVX-LABEL: bitselect_v4i64_rm: 359; AVX: # %bb.0: 360; AVX-NEXT: vmovaps (%rdi), %ymm1 361; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 362; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 363; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 364; AVX-NEXT: retq 365; 366; AVX512F-LABEL: bitselect_v4i64_rm: 367; AVX512F: # %bb.0: 368; AVX512F-NEXT: vmovaps (%rdi), %ymm1 369; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 370; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 371; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 372; AVX512F-NEXT: retq 373; 374; AVX512VL-LABEL: bitselect_v4i64_rm: 375; AVX512VL: # %bb.0: 376; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1 377; 
AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 378; AVX512VL-NEXT: retq 379 %3 = load <4 x i64>, <4 x i64>* %1 380 %4 = and <4 x i64> %0, <i64 8589934593, i64 3, i64 8589934593, i64 3> 381 %5 = and <4 x i64> %3, <i64 -8589934594, i64 -4, i64 -8589934594, i64 -4> 382 %6 = or <4 x i64> %5, %4 383 ret <4 x i64> %6 384} 385 386define <4 x i64> @bitselect_v4i64_mr(<4 x i64>* nocapture readonly, <4 x i64>) { 387; SSE-LABEL: bitselect_v4i64_mr: 388; SSE: # %bb.0: 389; SSE-NEXT: movaps {{.*#+}} xmm2 = [12884901890,4294967296] 390; SSE-NEXT: movaps 16(%rdi), %xmm4 391; SSE-NEXT: andps %xmm2, %xmm4 392; SSE-NEXT: movaps (%rdi), %xmm5 393; SSE-NEXT: andps %xmm2, %xmm5 394; SSE-NEXT: movaps %xmm2, %xmm3 395; SSE-NEXT: andnps %xmm0, %xmm3 396; SSE-NEXT: orps %xmm5, %xmm3 397; SSE-NEXT: andnps %xmm1, %xmm2 398; SSE-NEXT: orps %xmm4, %xmm2 399; SSE-NEXT: movaps %xmm3, %xmm0 400; SSE-NEXT: movaps %xmm2, %xmm1 401; SSE-NEXT: retq 402; 403; XOP-LABEL: bitselect_v4i64_mr: 404; XOP: # %bb.0: 405; XOP-NEXT: vmovdqa (%rdi), %ymm1 406; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1, %ymm0 407; XOP-NEXT: retq 408; 409; AVX-LABEL: bitselect_v4i64_mr: 410; AVX: # %bb.0: 411; AVX-NEXT: vmovaps (%rdi), %ymm1 412; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 413; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 414; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 415; AVX-NEXT: retq 416; 417; AVX512F-LABEL: bitselect_v4i64_mr: 418; AVX512F: # %bb.0: 419; AVX512F-NEXT: vmovaps (%rdi), %ymm1 420; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 421; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 422; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 423; AVX512F-NEXT: retq 424; 425; AVX512VL-LABEL: bitselect_v4i64_mr: 426; AVX512VL: # %bb.0: 427; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1 428; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 429; AVX512VL-NEXT: retq 430 %3 = load <4 x 
i64>, <4 x i64>* %0 431 %4 = and <4 x i64> %3, <i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296> 432 %5 = and <4 x i64> %1, <i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297> 433 %6 = or <4 x i64> %4, %5 434 ret <4 x i64> %6 435} 436 437define <4 x i64> @bitselect_v4i64_mm(<4 x i64>* nocapture readonly, <4 x i64>* nocapture readonly) { 438; SSE-LABEL: bitselect_v4i64_mm: 439; SSE: # %bb.0: 440; SSE-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551612,18446744065119617022] 441; SSE-NEXT: movaps 16(%rsi), %xmm2 442; SSE-NEXT: andps %xmm1, %xmm2 443; SSE-NEXT: movaps (%rsi), %xmm3 444; SSE-NEXT: andps %xmm1, %xmm3 445; SSE-NEXT: movaps %xmm1, %xmm0 446; SSE-NEXT: andnps (%rdi), %xmm0 447; SSE-NEXT: orps %xmm3, %xmm0 448; SSE-NEXT: andnps 16(%rdi), %xmm1 449; SSE-NEXT: orps %xmm2, %xmm1 450; SSE-NEXT: retq 451; 452; XOP-LABEL: bitselect_v4i64_mm: 453; XOP: # %bb.0: 454; XOP-NEXT: vmovdqa (%rsi), %ymm0 455; XOP-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] 456; XOP-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0 457; XOP-NEXT: retq 458; 459; AVX-LABEL: bitselect_v4i64_mm: 460; AVX: # %bb.0: 461; AVX-NEXT: vmovaps (%rdi), %ymm0 462; AVX-NEXT: vmovaps (%rsi), %ymm1 463; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 464; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 465; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0 466; AVX-NEXT: retq 467; 468; AVX512F-LABEL: bitselect_v4i64_mm: 469; AVX512F: # %bb.0: 470; AVX512F-NEXT: vmovaps (%rdi), %ymm0 471; AVX512F-NEXT: vmovaps (%rsi), %ymm1 472; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 473; AVX512F-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 474; AVX512F-NEXT: vorps %ymm0, %ymm1, %ymm0 475; AVX512F-NEXT: retq 476; 477; AVX512VL-LABEL: bitselect_v4i64_mm: 478; AVX512VL: # %bb.0: 479; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1 480; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = 
[18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] 481; AVX512VL-NEXT: vpternlogq $202, (%rdi), %ymm1, %ymm0 482; AVX512VL-NEXT: retq 483 %3 = load <4 x i64>, <4 x i64>* %0 484 %4 = load <4 x i64>, <4 x i64>* %1 485 %5 = and <4 x i64> %3, <i64 3, i64 8589934593, i64 3, i64 8589934593> 486 %6 = and <4 x i64> %4, <i64 -4, i64 -8589934594, i64 -4, i64 -8589934594> 487 %7 = or <4 x i64> %6, %5 488 ret <4 x i64> %7 489} 490 491define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i64 %a2) { 492; SSE-LABEL: bitselect_v4i64_broadcast_rrr: 493; SSE: # %bb.0: 494; SSE-NEXT: movq %rdi, %xmm4 495; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] 496; SSE-NEXT: pand %xmm4, %xmm1 497; SSE-NEXT: pand %xmm4, %xmm0 498; SSE-NEXT: movdqa %xmm4, %xmm5 499; SSE-NEXT: pandn %xmm3, %xmm5 500; SSE-NEXT: por %xmm5, %xmm1 501; SSE-NEXT: pandn %xmm2, %xmm4 502; SSE-NEXT: por %xmm4, %xmm0 503; SSE-NEXT: retq 504; 505; XOP-LABEL: bitselect_v4i64_broadcast_rrr: 506; XOP: # %bb.0: 507; XOP-NEXT: vmovq %rdi, %xmm2 508; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] 509; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 510; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 511; XOP-NEXT: retq 512; 513; AVX1-LABEL: bitselect_v4i64_broadcast_rrr: 514; AVX1: # %bb.0: 515; AVX1-NEXT: vmovq %rdi, %xmm2 516; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] 517; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 518; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 519; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 520; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 521; AVX1-NEXT: retq 522; 523; AVX2-LABEL: bitselect_v4i64_broadcast_rrr: 524; AVX2: # %bb.0: 525; AVX2-NEXT: vmovq %rdi, %xmm2 526; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 527; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 528; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1 529; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 530; AVX2-NEXT: retq 531; 532; AVX512F-LABEL: bitselect_v4i64_broadcast_rrr: 533; AVX512F: # %bb.0: 534; AVX512F-NEXT: vmovq %rdi, 
%xmm2 535; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 536; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 537; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1 538; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 539; AVX512F-NEXT: retq 540; 541; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrr: 542; AVX512VL: # %bb.0: 543; AVX512VL-NEXT: vpbroadcastq %rdi, %ymm2 544; AVX512VL-NEXT: vpternlogq $226, %ymm1, %ymm2, %ymm0 545; AVX512VL-NEXT: retq 546 %1 = insertelement <4 x i64> undef, i64 %a2, i32 0 547 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer 548 %3 = xor <4 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef> 549 %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> zeroinitializer 550 %5 = and <4 x i64> %a0, %2 551 %6 = and <4 x i64> %a1, %4 552 %7 = or <4 x i64> %5, %6 553 ret <4 x i64> %7 554} 555 556define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, i64* %p2) { 557; SSE-LABEL: bitselect_v4i64_broadcast_rrm: 558; SSE: # %bb.0: 559; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero 560; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] 561; SSE-NEXT: pand %xmm4, %xmm1 562; SSE-NEXT: pand %xmm4, %xmm0 563; SSE-NEXT: movdqa %xmm4, %xmm5 564; SSE-NEXT: pandn %xmm3, %xmm5 565; SSE-NEXT: por %xmm5, %xmm1 566; SSE-NEXT: pandn %xmm2, %xmm4 567; SSE-NEXT: por %xmm4, %xmm0 568; SSE-NEXT: retq 569; 570; XOP-LABEL: bitselect_v4i64_broadcast_rrm: 571; XOP: # %bb.0: 572; XOP-NEXT: vbroadcastsd (%rdi), %ymm2 573; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 574; XOP-NEXT: retq 575; 576; AVX-LABEL: bitselect_v4i64_broadcast_rrm: 577; AVX: # %bb.0: 578; AVX-NEXT: vbroadcastsd (%rdi), %ymm2 579; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 580; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 581; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 582; AVX-NEXT: retq 583; 584; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm: 585; AVX512F: # %bb.0: 586; AVX512F-NEXT: vbroadcastsd (%rdi), %ymm2 587; AVX512F-NEXT: vandps %ymm2, %ymm0, %ymm0 588; AVX512F-NEXT: vandnps %ymm1, %ymm2, %ymm1 589; 
AVX512F-NEXT: vorps %ymm1, %ymm0, %ymm0 590; AVX512F-NEXT: retq 591; 592; AVX512VL-LABEL: bitselect_v4i64_broadcast_rrm: 593; AVX512VL: # %bb.0: 594; AVX512VL-NEXT: vpternlogq $228, (%rdi){1to4}, %ymm1, %ymm0 595; AVX512VL-NEXT: retq 596 %a2 = load i64, i64* %p2 597 %1 = insertelement <4 x i64> undef, i64 %a2, i32 0 598 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> zeroinitializer 599 %3 = xor <4 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef> 600 %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> zeroinitializer 601 %5 = and <4 x i64> %a0, %2 602 %6 = and <4 x i64> %a1, %4 603 %7 = or <4 x i64> %5, %6 604 ret <4 x i64> %7 605} 606 607; 608; 512-bit vectors 609; 610 611define <8 x i64> @bitselect_v8i64_rr(<8 x i64>, <8 x i64>) { 612; SSE-LABEL: bitselect_v8i64_rr: 613; SSE: # %bb.0: 614; SSE-NEXT: movaps {{.*#+}} xmm8 = [18446744060824649725,18446744060824649725] 615; SSE-NEXT: andps %xmm8, %xmm7 616; SSE-NEXT: movaps {{.*#+}} xmm9 = [18446744069414584319,18446744060824649725] 617; SSE-NEXT: andps %xmm9, %xmm6 618; SSE-NEXT: andps %xmm8, %xmm5 619; SSE-NEXT: andps %xmm9, %xmm4 620; SSE-NEXT: movaps %xmm9, %xmm10 621; SSE-NEXT: andnps %xmm0, %xmm10 622; SSE-NEXT: orps %xmm4, %xmm10 623; SSE-NEXT: movaps %xmm8, %xmm4 624; SSE-NEXT: andnps %xmm1, %xmm4 625; SSE-NEXT: orps %xmm5, %xmm4 626; SSE-NEXT: andnps %xmm2, %xmm9 627; SSE-NEXT: orps %xmm6, %xmm9 628; SSE-NEXT: andnps %xmm3, %xmm8 629; SSE-NEXT: orps %xmm7, %xmm8 630; SSE-NEXT: movaps %xmm10, %xmm0 631; SSE-NEXT: movaps %xmm4, %xmm1 632; SSE-NEXT: movaps %xmm9, %xmm2 633; SSE-NEXT: movaps %xmm8, %xmm3 634; SSE-NEXT: retq 635; 636; XOP-LABEL: bitselect_v8i64_rr: 637; XOP: # %bb.0: 638; XOP-NEXT: vmovdqa {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725] 639; XOP-NEXT: vpcmov %ymm4, %ymm0, %ymm2, %ymm0 640; XOP-NEXT: vpcmov %ymm4, %ymm1, %ymm3, %ymm1 641; XOP-NEXT: retq 642; 643; AVX-LABEL: bitselect_v8i64_rr: 644; AVX: # %bb.0: 645; 
AVX-NEXT: vmovaps {{.*#+}} ymm4 = [18446744069414584319,18446744060824649725,18446744060824649725,18446744060824649725] 646; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3 647; AVX-NEXT: vandps %ymm4, %ymm2, %ymm2 648; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0 649; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 650; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1 651; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 652; AVX-NEXT: retq 653; 654; AVX512-LABEL: bitselect_v8i64_rr: 655; AVX512: # %bb.0: 656; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 657; AVX512-NEXT: retq 658 %3 = and <8 x i64> %0, <i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890, i64 4294967296, i64 12884901890, i64 12884901890, i64 12884901890> 659 %4 = and <8 x i64> %1, <i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -12884901891, i64 -12884901891> 660 %5 = or <8 x i64> %4, %3 661 ret <8 x i64> %5 662} 663 664define <8 x i64> @bitselect_v8i64_rm(<8 x i64>, <8 x i64>* nocapture readonly) { 665; SSE-LABEL: bitselect_v8i64_rm: 666; SSE: # %bb.0: 667; SSE-NEXT: movaps {{.*#+}} xmm4 = [18446744065119617022,18446744073709551612] 668; SSE-NEXT: movaps 48(%rdi), %xmm8 669; SSE-NEXT: andps %xmm4, %xmm8 670; SSE-NEXT: movaps 32(%rdi), %xmm9 671; SSE-NEXT: andps %xmm4, %xmm9 672; SSE-NEXT: movaps 16(%rdi), %xmm7 673; SSE-NEXT: andps %xmm4, %xmm7 674; SSE-NEXT: movaps (%rdi), %xmm6 675; SSE-NEXT: andps %xmm4, %xmm6 676; SSE-NEXT: movaps %xmm4, %xmm5 677; SSE-NEXT: andnps %xmm0, %xmm5 678; SSE-NEXT: orps %xmm6, %xmm5 679; SSE-NEXT: movaps %xmm4, %xmm6 680; SSE-NEXT: andnps %xmm1, %xmm6 681; SSE-NEXT: orps %xmm7, %xmm6 682; SSE-NEXT: movaps %xmm4, %xmm7 683; SSE-NEXT: andnps %xmm2, %xmm7 684; SSE-NEXT: orps %xmm9, %xmm7 685; SSE-NEXT: andnps %xmm3, %xmm4 686; SSE-NEXT: orps %xmm8, %xmm4 687; SSE-NEXT: movaps %xmm5, %xmm0 688; SSE-NEXT: movaps %xmm6, %xmm1 689; SSE-NEXT: movaps %xmm7, %xmm2 690; SSE-NEXT: movaps %xmm4, %xmm3 691; SSE-NEXT: retq 
692; 693; XOP-LABEL: bitselect_v8i64_rm: 694; XOP: # %bb.0: 695; XOP-NEXT: vmovdqa (%rdi), %ymm2 696; XOP-NEXT: vmovdqa 32(%rdi), %ymm3 697; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] 698; XOP-NEXT: # ymm4 = mem[0,1,0,1] 699; XOP-NEXT: vpcmov %ymm4, %ymm0, %ymm2, %ymm0 700; XOP-NEXT: vpcmov %ymm4, %ymm1, %ymm3, %ymm1 701; XOP-NEXT: retq 702; 703; AVX-LABEL: bitselect_v8i64_rm: 704; AVX: # %bb.0: 705; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] 706; AVX-NEXT: # ymm2 = mem[0,1,0,1] 707; AVX-NEXT: vandps 32(%rdi), %ymm2, %ymm3 708; AVX-NEXT: vandps (%rdi), %ymm2, %ymm4 709; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 710; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 711; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 712; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 713; AVX-NEXT: retq 714; 715; AVX512-LABEL: bitselect_v8i64_rm: 716; AVX512: # %bb.0: 717; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 718; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 719; AVX512-NEXT: retq 720 %3 = load <8 x i64>, <8 x i64>* %1 721 %4 = and <8 x i64> %0, <i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3> 722 %5 = and <8 x i64> %3, <i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4> 723 %6 = or <8 x i64> %5, %4 724 ret <8 x i64> %6 725} 726 727define <8 x i64> @bitselect_v8i64_mr(<8 x i64>* nocapture readonly, <8 x i64>) { 728; SSE-LABEL: bitselect_v8i64_mr: 729; SSE: # %bb.0: 730; SSE-NEXT: movaps {{.*#+}} xmm4 = [12884901890,4294967296] 731; SSE-NEXT: movaps 48(%rdi), %xmm8 732; SSE-NEXT: andps %xmm4, %xmm8 733; SSE-NEXT: movaps 32(%rdi), %xmm9 734; SSE-NEXT: andps %xmm4, %xmm9 735; SSE-NEXT: movaps 16(%rdi), %xmm7 736; SSE-NEXT: andps %xmm4, %xmm7 737; SSE-NEXT: movaps (%rdi), %xmm6 738; SSE-NEXT: andps %xmm4, %xmm6 739; SSE-NEXT: movaps 
%xmm4, %xmm5 740; SSE-NEXT: andnps %xmm0, %xmm5 741; SSE-NEXT: orps %xmm6, %xmm5 742; SSE-NEXT: movaps %xmm4, %xmm6 743; SSE-NEXT: andnps %xmm1, %xmm6 744; SSE-NEXT: orps %xmm7, %xmm6 745; SSE-NEXT: movaps %xmm4, %xmm7 746; SSE-NEXT: andnps %xmm2, %xmm7 747; SSE-NEXT: orps %xmm9, %xmm7 748; SSE-NEXT: andnps %xmm3, %xmm4 749; SSE-NEXT: orps %xmm8, %xmm4 750; SSE-NEXT: movaps %xmm5, %xmm0 751; SSE-NEXT: movaps %xmm6, %xmm1 752; SSE-NEXT: movaps %xmm7, %xmm2 753; SSE-NEXT: movaps %xmm4, %xmm3 754; SSE-NEXT: retq 755; 756; XOP-LABEL: bitselect_v8i64_mr: 757; XOP: # %bb.0: 758; XOP-NEXT: vmovdqa (%rdi), %ymm2 759; XOP-NEXT: vmovdqa 32(%rdi), %ymm3 760; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [12884901890,4294967296,12884901890,4294967296] 761; XOP-NEXT: # ymm4 = mem[0,1,0,1] 762; XOP-NEXT: vpcmov %ymm4, %ymm0, %ymm2, %ymm0 763; XOP-NEXT: vpcmov %ymm4, %ymm1, %ymm3, %ymm1 764; XOP-NEXT: retq 765; 766; AVX-LABEL: bitselect_v8i64_mr: 767; AVX: # %bb.0: 768; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] 769; AVX-NEXT: # ymm2 = mem[0,1,0,1] 770; AVX-NEXT: vandps 32(%rdi), %ymm2, %ymm3 771; AVX-NEXT: vandps (%rdi), %ymm2, %ymm4 772; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 773; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0 774; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 775; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1 776; AVX-NEXT: retq 777; 778; AVX512-LABEL: bitselect_v8i64_mr: 779; AVX512: # %bb.0: 780; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 781; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 782; AVX512-NEXT: retq 783 %3 = load <8 x i64>, <8 x i64>* %0 784 %4 = and <8 x i64> %3, <i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296, i64 12884901890, i64 4294967296> 785 %5 = and <8 x i64> %1, <i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297, i64 -12884901891, i64 -4294967297> 786 %6 = or <8 x i64> %4, %5 787 ret <8 x i64> %6 788} 
789 790define <8 x i64> @bitselect_v8i64_mm(<8 x i64>* nocapture readonly, <8 x i64>* nocapture readonly) { 791; SSE-LABEL: bitselect_v8i64_mm: 792; SSE: # %bb.0: 793; SSE-NEXT: movaps {{.*#+}} xmm3 = [18446744073709551612,18446744065119617022] 794; SSE-NEXT: movaps 48(%rsi), %xmm4 795; SSE-NEXT: andps %xmm3, %xmm4 796; SSE-NEXT: movaps 32(%rsi), %xmm5 797; SSE-NEXT: andps %xmm3, %xmm5 798; SSE-NEXT: movaps 16(%rsi), %xmm2 799; SSE-NEXT: andps %xmm3, %xmm2 800; SSE-NEXT: movaps (%rsi), %xmm1 801; SSE-NEXT: andps %xmm3, %xmm1 802; SSE-NEXT: movaps %xmm3, %xmm0 803; SSE-NEXT: andnps (%rdi), %xmm0 804; SSE-NEXT: orps %xmm1, %xmm0 805; SSE-NEXT: movaps %xmm3, %xmm1 806; SSE-NEXT: andnps 16(%rdi), %xmm1 807; SSE-NEXT: orps %xmm2, %xmm1 808; SSE-NEXT: movaps %xmm3, %xmm2 809; SSE-NEXT: andnps 32(%rdi), %xmm2 810; SSE-NEXT: orps %xmm5, %xmm2 811; SSE-NEXT: andnps 48(%rdi), %xmm3 812; SSE-NEXT: orps %xmm4, %xmm3 813; SSE-NEXT: retq 814; 815; XOP-LABEL: bitselect_v8i64_mm: 816; XOP: # %bb.0: 817; XOP-NEXT: vmovdqa (%rsi), %ymm0 818; XOP-NEXT: vmovdqa 32(%rsi), %ymm1 819; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] 820; XOP-NEXT: # ymm2 = mem[0,1,0,1] 821; XOP-NEXT: vpcmov %ymm2, (%rdi), %ymm0, %ymm0 822; XOP-NEXT: vpcmov %ymm2, 32(%rdi), %ymm1, %ymm1 823; XOP-NEXT: retq 824; 825; AVX-LABEL: bitselect_v8i64_mm: 826; AVX: # %bb.0: 827; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] 828; AVX-NEXT: # ymm1 = mem[0,1,0,1] 829; AVX-NEXT: vandps 32(%rsi), %ymm1, %ymm2 830; AVX-NEXT: vandps (%rsi), %ymm1, %ymm0 831; AVX-NEXT: vandnps (%rdi), %ymm1, %ymm3 832; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0 833; AVX-NEXT: vandnps 32(%rdi), %ymm1, %ymm1 834; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 835; AVX-NEXT: retq 836; 837; AVX512-LABEL: bitselect_v8i64_mm: 838; AVX512: # %bb.0: 839; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 840; AVX512-NEXT: 
vmovdqa64 {{.*#+}} zmm0 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022]
; AVX512-NEXT: vpternlogq $202, (%rdi), %zmm1, %zmm0
; AVX512-NEXT: retq
  %3 = load <8 x i64>, <8 x i64>* %0
  %4 = load <8 x i64>, <8 x i64>* %1
  %5 = and <8 x i64> %3, <i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593, i64 3, i64 8589934593>
  %6 = and <8 x i64> %4, <i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594, i64 -4, i64 -8589934594>
  %7 = or <8 x i64> %6, %5
  ret <8 x i64> %7
}

; Bitselect with the mask broadcast from a GPR argument:
; result = (a0 & splat(a2)) | (a1 & ~splat(a2)).
; On AVX512 this should fold to a single vpternlogq; SSE/AVX fall back to
; and/andn/or sequences against the broadcast mask.
define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i64 %a2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrr:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm8
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: movdqa %xmm8, %xmm9
; SSE-NEXT: pandn %xmm7, %xmm9
; SSE-NEXT: por %xmm9, %xmm3
; SSE-NEXT: movdqa %xmm8, %xmm7
; SSE-NEXT: pandn %xmm6, %xmm7
; SSE-NEXT: por %xmm7, %xmm2
; SSE-NEXT: movdqa %xmm8, %xmm6
; SSE-NEXT: pandn %xmm5, %xmm6
; SSE-NEXT: por %xmm6, %xmm1
; SSE-NEXT: pandn %xmm4, %xmm8
; SSE-NEXT: por %xmm8, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrr:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm4
; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; XOP-NEXT: vpcmov %ymm4, %ymm2, %ymm0, %ymm0
; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm3
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %rdi, %xmm4
; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq %rdi, %zmm2
; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm0
; AVX512-NEXT: retq
  %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
  %3 = xor <8 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> zeroinitializer
  %5 = and <8 x i64> %a0, %2
  %6 = and <8 x i64> %a1, %4
  %7 = or <8 x i64> %5, %6
  ret <8 x i64> %7
}

; Same bitselect pattern, but the scalar mask is loaded from memory; on
; AVX512 the load should fold into vpternlogq as an embedded broadcast
; ((%rdi){1to8}) instead of requiring a separate vpbroadcastq.
define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, i64* %p2) {
; SSE-LABEL: bitselect_v8i64_broadcast_rrm:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm8 = mem[0],zero
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: movdqa %xmm8, %xmm9
; SSE-NEXT: pandn %xmm7, %xmm9
; SSE-NEXT: por %xmm9, %xmm3
; SSE-NEXT: movdqa %xmm8, %xmm7
; SSE-NEXT: pandn %xmm6, %xmm7
; SSE-NEXT: por %xmm7, %xmm2
; SSE-NEXT: movdqa %xmm8, %xmm6
; SSE-NEXT: pandn %xmm5, %xmm6
; SSE-NEXT: por %xmm6, %xmm1
; SSE-NEXT: pandn %xmm4, %xmm8
; SSE-NEXT: por %xmm8, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v8i64_broadcast_rrm:
; XOP: # %bb.0:
; XOP-NEXT: vbroadcastsd (%rdi), %ymm4
; XOP-NEXT: vpcmov %ymm4, %ymm2, %ymm0, %ymm0
; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1
; XOP-NEXT: retq
;
; AVX-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastsd (%rdi), %ymm4
; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX-NEXT: vandnps %ymm3, %ymm4, %ymm3
; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2
; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: bitselect_v8i64_broadcast_rrm:
; AVX512: # %bb.0:
; AVX512-NEXT: vpternlogq $228, (%rdi){1to8}, %zmm1, %zmm0
; AVX512-NEXT: retq
  %a2 = load i64, i64* %p2
  %1 = insertelement <8 x i64> undef, i64 %a2, i32 0
  %2 = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> zeroinitializer
  %3 = xor <8 x i64> %1, <i64 -1, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> zeroinitializer
  %5 = and <8 x i64> %a0, %2
  %6 = and <8 x i64> %a1, %4
  %7 = or <8 x i64> %5, %6
  ret <8 x i64> %7
}

; Check that mask registers don't get canonicalized.
; Select between two v4i1 compare results; on the AVX512 targets the i1
; vectors live in k-registers (%k0/%k1/%k2) and the select becomes korw of
; the two masked compares rather than being widened back to vector form.
define <4 x i1> @bitselect_v4i1_loop(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: bitselect_v4i1_loop:
; SSE: # %bb.0: # %bb
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pcmpeqd %xmm0, %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [12,12,12,12]
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm2
; SSE-NEXT: por %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; XOP-LABEL: bitselect_v4i1_loop:
; XOP: # %bb.0: # %bb
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpcomneqd %xmm2, %xmm0, %xmm0
; XOP-NEXT: vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT: vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v4i1_loop:
; AVX1: # %bb.0: # %bb
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: bitselect_v4i1_loop:
; AVX2: # %bb.0: # %bb
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [12,12,12,12]
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [15,15,15,15]
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: bitselect_v4i1_loop:
; AVX512F: # %bb.0: # %bb
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
; AVX512F-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k2
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k2}
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512F-NEXT: korw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bitselect_v4i1_loop:
; AVX512VL: # %bb.0: # %bb
; AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k2
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0 {%k2}
; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 {%k1}
; AVX512VL-NEXT: korw %k0, %k1, %k1
; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
bb:
  %tmp = icmp ne <4 x i32> %a0, zeroinitializer
  %tmp2 = icmp eq <4 x i32> %a1, <i32 12, i32 12, i32 12, i32 12>
  %tmp3 = icmp eq <4 x i32> %a1, <i32 15, i32 15, i32 15, i32 15>
  %tmp4 = select <4 x i1> %tmp, <4 x i1> %tmp2, <4 x i1> %tmp3
  ret <4 x i1> %tmp4
}
