1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefixes=CHECK,KNL %s 3; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefixes=CHECK,SKX %s 4; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefixes=CHECK,SKX %s 5 6define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { 7; CHECK-LABEL: test1: 8; CHECK: ## %bb.0: 9; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3] 10; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 11; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 12; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 13; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 14; CHECK-NEXT: retq 15 %rrr = load float, float* %br 16 %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1 17 %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14 18 ret <16 x float> %rrr3 19} 20 21define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { 22; CHECK-LABEL: test2: 23; CHECK: ## %bb.0: 24; CHECK-NEXT: vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1] 25; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2 26; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 27; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 28; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0 29; CHECK-NEXT: retq 30 %rrr = load double, double* %br 31 %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1 32 %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6 33 ret <8 x double> %rrr3 34} 35 36define <16 x float> @test3(<16 x float> %x) nounwind { 37; CHECK-LABEL: test3: 38; CHECK: ## %bb.0: 39; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 40; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3] 41; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 42; CHECK-NEXT: retq 43 %eee = 
extractelement <16 x float> %x, i32 4 44 %rrr2 = insertelement <16 x float> %x, float %eee, i32 1 45 ret <16 x float> %rrr2 46} 47 48define <8 x i64> @test4(<8 x i64> %x) nounwind { 49; CHECK-LABEL: test4: 50; CHECK: ## %bb.0: 51; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 52; CHECK-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] 53; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 54; CHECK-NEXT: retq 55 %eee = extractelement <8 x i64> %x, i32 4 56 %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1 57 ret <8 x i64> %rrr2 58} 59 60define i32 @test5(<4 x float> %x) nounwind { 61; CHECK-LABEL: test5: 62; CHECK: ## %bb.0: 63; CHECK-NEXT: vextractps $3, %xmm0, %eax 64; CHECK-NEXT: retq 65 %ef = extractelement <4 x float> %x, i32 3 66 %ei = bitcast float %ef to i32 67 ret i32 %ei 68} 69 70define void @test6(<4 x float> %x, float* %out) nounwind { 71; CHECK-LABEL: test6: 72; CHECK: ## %bb.0: 73; CHECK-NEXT: vextractps $3, %xmm0, (%rdi) 74; CHECK-NEXT: retq 75 %ef = extractelement <4 x float> %x, i32 3 76 store float %ef, float* %out, align 4 77 ret void 78} 79 80define float @test7(<16 x float> %x, i32 %ind) nounwind { 81; CHECK-LABEL: test7: 82; CHECK: ## %bb.0: 83; CHECK-NEXT: pushq %rbp 84; CHECK-NEXT: movq %rsp, %rbp 85; CHECK-NEXT: andq $-64, %rsp 86; CHECK-NEXT: subq $128, %rsp 87; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi 88; CHECK-NEXT: vmovaps %zmm0, (%rsp) 89; CHECK-NEXT: andl $15, %edi 90; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 91; CHECK-NEXT: movq %rbp, %rsp 92; CHECK-NEXT: popq %rbp 93; CHECK-NEXT: vzeroupper 94; CHECK-NEXT: retq 95 %e = extractelement <16 x float> %x, i32 %ind 96 ret float %e 97} 98 99define double @test8(<8 x double> %x, i32 %ind) nounwind { 100; CHECK-LABEL: test8: 101; CHECK: ## %bb.0: 102; CHECK-NEXT: pushq %rbp 103; CHECK-NEXT: movq %rsp, %rbp 104; CHECK-NEXT: andq $-64, %rsp 105; CHECK-NEXT: subq $128, %rsp 106; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi 107; CHECK-NEXT: vmovaps %zmm0, (%rsp) 108; 
CHECK-NEXT: andl $7, %edi 109; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 110; CHECK-NEXT: movq %rbp, %rsp 111; CHECK-NEXT: popq %rbp 112; CHECK-NEXT: vzeroupper 113; CHECK-NEXT: retq 114 %e = extractelement <8 x double> %x, i32 %ind 115 ret double %e 116} 117 118define float @test9(<8 x float> %x, i32 %ind) nounwind { 119; CHECK-LABEL: test9: 120; CHECK: ## %bb.0: 121; CHECK-NEXT: pushq %rbp 122; CHECK-NEXT: movq %rsp, %rbp 123; CHECK-NEXT: andq $-32, %rsp 124; CHECK-NEXT: subq $64, %rsp 125; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi 126; CHECK-NEXT: vmovaps %ymm0, (%rsp) 127; CHECK-NEXT: andl $7, %edi 128; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 129; CHECK-NEXT: movq %rbp, %rsp 130; CHECK-NEXT: popq %rbp 131; CHECK-NEXT: vzeroupper 132; CHECK-NEXT: retq 133 %e = extractelement <8 x float> %x, i32 %ind 134 ret float %e 135} 136 137define i32 @test10(<16 x i32> %x, i32 %ind) nounwind { 138; CHECK-LABEL: test10: 139; CHECK: ## %bb.0: 140; CHECK-NEXT: pushq %rbp 141; CHECK-NEXT: movq %rsp, %rbp 142; CHECK-NEXT: andq $-64, %rsp 143; CHECK-NEXT: subq $128, %rsp 144; CHECK-NEXT: ## kill: def $edi killed $edi def $rdi 145; CHECK-NEXT: vmovaps %zmm0, (%rsp) 146; CHECK-NEXT: andl $15, %edi 147; CHECK-NEXT: movl (%rsp,%rdi,4), %eax 148; CHECK-NEXT: movq %rbp, %rsp 149; CHECK-NEXT: popq %rbp 150; CHECK-NEXT: vzeroupper 151; CHECK-NEXT: retq 152 %e = extractelement <16 x i32> %x, i32 %ind 153 ret i32 %e 154} 155 156define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) { 157; KNL-LABEL: test11: 158; KNL: ## %bb.0: 159; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0 160; KNL-NEXT: kshiftrw $4, %k0, %k0 161; KNL-NEXT: kmovw %k0, %eax 162; KNL-NEXT: testb $1, %al 163; KNL-NEXT: je LBB10_2 164; KNL-NEXT: ## %bb.1: ## %A 165; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 166; KNL-NEXT: retq 167; KNL-NEXT: LBB10_2: ## %B 168; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 169; KNL-NEXT: retq 170; 171; SKX-LABEL: test11: 172; SKX: ## %bb.0: 173; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 
174; SKX-NEXT: kshiftrw $4, %k0, %k0 175; SKX-NEXT: kmovd %k0, %eax 176; SKX-NEXT: testb $1, %al 177; SKX-NEXT: je LBB10_2 178; SKX-NEXT: ## %bb.1: ## %A 179; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 180; SKX-NEXT: retq 181; SKX-NEXT: LBB10_2: ## %B 182; SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 183; SKX-NEXT: retq 184 %cmp_res = icmp ult <16 x i32> %a, %b 185 %ia = extractelement <16 x i1> %cmp_res, i32 4 186 br i1 %ia, label %A, label %B 187 A: 188 ret <16 x i32>%b 189 B: 190 %c = add <16 x i32>%b, %a 191 ret <16 x i32>%c 192} 193 194define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) { 195; KNL-LABEL: test12: 196; KNL: ## %bb.0: 197; KNL-NEXT: movq %rdi, %rax 198; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 199; KNL-NEXT: kmovw %k0, %ecx 200; KNL-NEXT: testb $1, %cl 201; KNL-NEXT: cmoveq %rsi, %rax 202; KNL-NEXT: vzeroupper 203; KNL-NEXT: retq 204; 205; SKX-LABEL: test12: 206; SKX: ## %bb.0: 207; SKX-NEXT: movq %rdi, %rax 208; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0 209; SKX-NEXT: kmovd %k0, %ecx 210; SKX-NEXT: testb $1, %cl 211; SKX-NEXT: cmoveq %rsi, %rax 212; SKX-NEXT: vzeroupper 213; SKX-NEXT: retq 214 %cmpvector_func.i = icmp slt <16 x i64> %a, %b 215 %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0 216 %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1 217 ret i64 %res 218} 219 220define i16 @test13(i32 %a, i32 %b) { 221; KNL-LABEL: test13: 222; KNL: ## %bb.0: 223; KNL-NEXT: cmpl %esi, %edi 224; KNL-NEXT: setb %al 225; KNL-NEXT: movw $-4, %cx 226; KNL-NEXT: kmovw %ecx, %k0 227; KNL-NEXT: kshiftrw $1, %k0, %k0 228; KNL-NEXT: kshiftlw $1, %k0, %k0 229; KNL-NEXT: andl $1, %eax 230; KNL-NEXT: kmovw %eax, %k1 231; KNL-NEXT: korw %k1, %k0, %k0 232; KNL-NEXT: kmovw %k0, %eax 233; KNL-NEXT: ## kill: def $ax killed $ax killed $eax 234; KNL-NEXT: retq 235; 236; SKX-LABEL: test13: 237; SKX: ## %bb.0: 238; SKX-NEXT: cmpl %esi, %edi 239; SKX-NEXT: setb %al 240; SKX-NEXT: movw $-4, %cx 241; SKX-NEXT: kmovd %ecx, %k0 242; SKX-NEXT: kshiftrw $1, 
%k0, %k0 243; SKX-NEXT: kshiftlw $1, %k0, %k0 244; SKX-NEXT: andl $1, %eax 245; SKX-NEXT: kmovw %eax, %k1 246; SKX-NEXT: korw %k1, %k0, %k0 247; SKX-NEXT: kmovd %k0, %eax 248; SKX-NEXT: ## kill: def $ax killed $ax killed $eax 249; SKX-NEXT: retq 250 %cmp_res = icmp ult i32 %a, %b 251 %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0 252 %res = bitcast <16 x i1> %maskv to i16 253 ret i16 %res 254} 255 256define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) { 257; KNL-LABEL: test14: 258; KNL: ## %bb.0: 259; KNL-NEXT: movq %rdi, %rax 260; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 261; KNL-NEXT: kshiftrw $4, %k0, %k0 262; KNL-NEXT: kmovw %k0, %ecx 263; KNL-NEXT: testb $1, %cl 264; KNL-NEXT: cmoveq %rsi, %rax 265; KNL-NEXT: vzeroupper 266; KNL-NEXT: retq 267; 268; SKX-LABEL: test14: 269; SKX: ## %bb.0: 270; SKX-NEXT: movq %rdi, %rax 271; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 272; SKX-NEXT: kshiftrb $4, %k0, %k0 273; SKX-NEXT: kmovd %k0, %ecx 274; SKX-NEXT: testb $1, %cl 275; SKX-NEXT: cmoveq %rsi, %rax 276; SKX-NEXT: vzeroupper 277; SKX-NEXT: retq 278 %cmpvector_func.i = icmp slt <8 x i64> %a, %b 279 %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4 280 %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1 281 ret i64 %res 282} 283 284define i16 @test15(i1 *%addr) { 285; CHECK-LABEL: test15: 286; CHECK: ## %bb.0: 287; CHECK-NEXT: xorl %ecx, %ecx 288; CHECK-NEXT: cmpb $0, (%rdi) 289; CHECK-NEXT: movl $65535, %eax ## imm = 0xFFFF 290; CHECK-NEXT: cmovel %ecx, %eax 291; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax 292; CHECK-NEXT: retq 293 %x = load i1 , i1 * %addr, align 1 294 %x1 = insertelement <16 x i1> undef, i1 %x, i32 10 295 %x2 = bitcast <16 x i1>%x1 to i16 296 ret i16 %x2 297} 298 299define i16 @test16(i1 *%addr, i16 %a) { 300; KNL-LABEL: test16: 301; KNL: ## %bb.0: 302; 
KNL-NEXT: movb (%rdi), %al 303; KNL-NEXT: kmovw %esi, %k0 304; KNL-NEXT: movw $-1025, %cx ## imm = 0xFBFF 305; KNL-NEXT: kmovw %ecx, %k1 306; KNL-NEXT: kandw %k1, %k0, %k0 307; KNL-NEXT: kmovw %eax, %k1 308; KNL-NEXT: kshiftlw $15, %k1, %k1 309; KNL-NEXT: kshiftrw $5, %k1, %k1 310; KNL-NEXT: korw %k1, %k0, %k0 311; KNL-NEXT: kmovw %k0, %eax 312; KNL-NEXT: ## kill: def $ax killed $ax killed $eax 313; KNL-NEXT: retq 314; 315; SKX-LABEL: test16: 316; SKX: ## %bb.0: 317; SKX-NEXT: kmovb (%rdi), %k0 318; SKX-NEXT: kmovd %esi, %k1 319; SKX-NEXT: movw $-1025, %ax ## imm = 0xFBFF 320; SKX-NEXT: kmovd %eax, %k2 321; SKX-NEXT: kandw %k2, %k1, %k1 322; SKX-NEXT: kshiftlw $15, %k0, %k0 323; SKX-NEXT: kshiftrw $5, %k0, %k0 324; SKX-NEXT: korw %k0, %k1, %k0 325; SKX-NEXT: kmovd %k0, %eax 326; SKX-NEXT: ## kill: def $ax killed $ax killed $eax 327; SKX-NEXT: retq 328 %x = load i1 , i1 * %addr, align 128 329 %a1 = bitcast i16 %a to <16 x i1> 330 %x1 = insertelement <16 x i1> %a1, i1 %x, i32 10 331 %x2 = bitcast <16 x i1>%x1 to i16 332 ret i16 %x2 333} 334 335define i8 @test17(i1 *%addr, i8 %a) { 336; KNL-LABEL: test17: 337; KNL: ## %bb.0: 338; KNL-NEXT: movb (%rdi), %al 339; KNL-NEXT: kmovw %esi, %k0 340; KNL-NEXT: movw $-17, %cx 341; KNL-NEXT: kmovw %ecx, %k1 342; KNL-NEXT: kandw %k1, %k0, %k0 343; KNL-NEXT: kmovw %eax, %k1 344; KNL-NEXT: kshiftlw $15, %k1, %k1 345; KNL-NEXT: kshiftrw $11, %k1, %k1 346; KNL-NEXT: korw %k1, %k0, %k0 347; KNL-NEXT: kmovw %k0, %eax 348; KNL-NEXT: ## kill: def $al killed $al killed $eax 349; KNL-NEXT: retq 350; 351; SKX-LABEL: test17: 352; SKX: ## %bb.0: 353; SKX-NEXT: kmovb (%rdi), %k0 354; SKX-NEXT: kmovd %esi, %k1 355; SKX-NEXT: movb $-17, %al 356; SKX-NEXT: kmovd %eax, %k2 357; SKX-NEXT: kandb %k2, %k1, %k1 358; SKX-NEXT: kshiftlb $7, %k0, %k0 359; SKX-NEXT: kshiftrb $3, %k0, %k0 360; SKX-NEXT: korb %k0, %k1, %k0 361; SKX-NEXT: kmovd %k0, %eax 362; SKX-NEXT: ## kill: def $al killed $al killed $eax 363; SKX-NEXT: retq 364 %x = load i1 , i1 * %addr, 
align 128 365 %a1 = bitcast i8 %a to <8 x i1> 366 %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4 367 %x2 = bitcast <8 x i1>%x1 to i8 368 ret i8 %x2 369} 370 371define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) { 372; CHECK-LABEL: extract_v8i64: 373; CHECK: ## %bb.0: 374; CHECK-NEXT: vpextrq $1, %xmm0, %rax 375; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 376; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi) 377; CHECK-NEXT: vzeroupper 378; CHECK-NEXT: retq 379 %r1 = extractelement <8 x i64> %x, i32 1 380 %r2 = extractelement <8 x i64> %x, i32 3 381 store i64 %r2, i64* %dst, align 1 382 ret i64 %r1 383} 384 385define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) { 386; CHECK-LABEL: extract_v4i64: 387; CHECK: ## %bb.0: 388; CHECK-NEXT: vpextrq $1, %xmm0, %rax 389; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 390; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi) 391; CHECK-NEXT: vzeroupper 392; CHECK-NEXT: retq 393 %r1 = extractelement <4 x i64> %x, i32 1 394 %r2 = extractelement <4 x i64> %x, i32 3 395 store i64 %r2, i64* %dst, align 1 396 ret i64 %r1 397} 398 399define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) { 400; CHECK-LABEL: extract_v2i64: 401; CHECK: ## %bb.0: 402; CHECK-NEXT: vmovq %xmm0, %rax 403; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi) 404; CHECK-NEXT: retq 405 %r1 = extractelement <2 x i64> %x, i32 0 406 %r2 = extractelement <2 x i64> %x, i32 1 407 store i64 %r2, i64* %dst, align 1 408 ret i64 %r1 409} 410 411define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) { 412; CHECK-LABEL: extract_v16i32: 413; CHECK: ## %bb.0: 414; CHECK-NEXT: vextractps $1, %xmm0, %eax 415; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 416; CHECK-NEXT: vextractps $1, %xmm0, (%rdi) 417; CHECK-NEXT: vzeroupper 418; CHECK-NEXT: retq 419 %r1 = extractelement <16 x i32> %x, i32 1 420 %r2 = extractelement <16 x i32> %x, i32 5 421 store i32 %r2, i32* %dst, align 1 422 ret i32 %r1 423} 424 425define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) { 426; CHECK-LABEL: extract_v8i32: 427; CHECK: ## %bb.0: 428; CHECK-NEXT: 
vextractps $1, %xmm0, %eax 429; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 430; CHECK-NEXT: vextractps $1, %xmm0, (%rdi) 431; CHECK-NEXT: vzeroupper 432; CHECK-NEXT: retq 433 %r1 = extractelement <8 x i32> %x, i32 1 434 %r2 = extractelement <8 x i32> %x, i32 5 435 store i32 %r2, i32* %dst, align 1 436 ret i32 %r1 437} 438 439define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) { 440; CHECK-LABEL: extract_v4i32: 441; CHECK: ## %bb.0: 442; CHECK-NEXT: vextractps $1, %xmm0, %eax 443; CHECK-NEXT: vextractps $3, %xmm0, (%rdi) 444; CHECK-NEXT: retq 445 %r1 = extractelement <4 x i32> %x, i32 1 446 %r2 = extractelement <4 x i32> %x, i32 3 447 store i32 %r2, i32* %dst, align 1 448 ret i32 %r1 449} 450 451define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) { 452; CHECK-LABEL: extract_v32i16: 453; CHECK: ## %bb.0: 454; CHECK-NEXT: vpextrw $1, %xmm0, %eax 455; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 456; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi) 457; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax 458; CHECK-NEXT: vzeroupper 459; CHECK-NEXT: retq 460 %r1 = extractelement <32 x i16> %x, i32 1 461 %r2 = extractelement <32 x i16> %x, i32 9 462 store i16 %r2, i16* %dst, align 1 463 ret i16 %r1 464} 465 466define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) { 467; CHECK-LABEL: extract_v16i16: 468; CHECK: ## %bb.0: 469; CHECK-NEXT: vpextrw $1, %xmm0, %eax 470; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 471; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi) 472; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax 473; CHECK-NEXT: vzeroupper 474; CHECK-NEXT: retq 475 %r1 = extractelement <16 x i16> %x, i32 1 476 %r2 = extractelement <16 x i16> %x, i32 9 477 store i16 %r2, i16* %dst, align 1 478 ret i16 %r1 479} 480 481define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) { 482; CHECK-LABEL: extract_v8i16: 483; CHECK: ## %bb.0: 484; CHECK-NEXT: vpextrw $1, %xmm0, %eax 485; CHECK-NEXT: vpextrw $3, %xmm0, (%rdi) 486; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax 487; CHECK-NEXT: retq 488 %r1 = 
extractelement <8 x i16> %x, i32 1 489 %r2 = extractelement <8 x i16> %x, i32 3 490 store i16 %r2, i16* %dst, align 1 491 ret i16 %r1 492} 493 494define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) { 495; CHECK-LABEL: extract_v64i8: 496; CHECK: ## %bb.0: 497; CHECK-NEXT: vpextrb $1, %xmm0, %eax 498; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 499; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi) 500; CHECK-NEXT: ## kill: def $al killed $al killed $eax 501; CHECK-NEXT: vzeroupper 502; CHECK-NEXT: retq 503 %r1 = extractelement <64 x i8> %x, i32 1 504 %r2 = extractelement <64 x i8> %x, i32 17 505 store i8 %r2, i8* %dst, align 1 506 ret i8 %r1 507} 508 509define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) { 510; CHECK-LABEL: extract_v32i8: 511; CHECK: ## %bb.0: 512; CHECK-NEXT: vpextrb $1, %xmm0, %eax 513; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 514; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi) 515; CHECK-NEXT: ## kill: def $al killed $al killed $eax 516; CHECK-NEXT: vzeroupper 517; CHECK-NEXT: retq 518 %r1 = extractelement <32 x i8> %x, i32 1 519 %r2 = extractelement <32 x i8> %x, i32 17 520 store i8 %r2, i8* %dst, align 1 521 ret i8 %r1 522} 523 524define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) { 525; CHECK-LABEL: extract_v16i8: 526; CHECK: ## %bb.0: 527; CHECK-NEXT: vpextrb $1, %xmm0, %eax 528; CHECK-NEXT: vpextrb $3, %xmm0, (%rdi) 529; CHECK-NEXT: ## kill: def $al killed $al killed $eax 530; CHECK-NEXT: retq 531 %r1 = extractelement <16 x i8> %x, i32 1 532 %r2 = extractelement <16 x i8> %x, i32 3 533 store i8 %r2, i8* %dst, align 1 534 ret i8 %r1 535} 536 537define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) { 538; CHECK-LABEL: insert_v8i64: 539; CHECK: ## %bb.0: 540; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 541; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 542; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 543; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 544; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 545; CHECK-NEXT: retq 546 %val = load i64, i64* %ptr 547 %r1 
= insertelement <8 x i64> %x, i64 %val, i32 1 548 %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3 549 ret <8 x i64> %r2 550} 551 552define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) { 553; CHECK-LABEL: insert_v4i64: 554; CHECK: ## %bb.0: 555; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1 556; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 557; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 558; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 559; CHECK-NEXT: retq 560 %val = load i64, i64* %ptr 561 %r1 = insertelement <4 x i64> %x, i64 %val, i32 1 562 %r2 = insertelement <4 x i64> %r1, i64 %y, i32 3 563 ret <4 x i64> %r2 564} 565 566define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) { 567; CHECK-LABEL: insert_v2i64: 568; CHECK: ## %bb.0: 569; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 570; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0 571; CHECK-NEXT: retq 572 %val = load i64, i64* %ptr 573 %r1 = insertelement <2 x i64> %x, i64 %val, i32 1 574 %r2 = insertelement <2 x i64> %r1, i64 %y, i32 0 575 ret <2 x i64> %r2 576} 577 578define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) { 579; CHECK-LABEL: insert_v16i32: 580; CHECK: ## %bb.0: 581; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 582; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 583; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 584; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 585; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 586; CHECK-NEXT: retq 587 %val = load i32, i32* %ptr 588 %r1 = insertelement <16 x i32> %x, i32 %val, i32 1 589 %r2 = insertelement <16 x i32> %r1, i32 %y, i32 5 590 ret <16 x i32> %r2 591} 592 593define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) { 594; CHECK-LABEL: insert_v8i32: 595; CHECK: ## %bb.0: 596; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1 597; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 598; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 599; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 600; CHECK-NEXT: retq 601 %val = load i32, i32* 
%ptr 602 %r1 = insertelement <8 x i32> %x, i32 %val, i32 1 603 %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5 604 ret <8 x i32> %r2 605} 606 607define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) { 608; CHECK-LABEL: insert_v4i32: 609; CHECK: ## %bb.0: 610; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 611; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 612; CHECK-NEXT: retq 613 %val = load i32, i32* %ptr 614 %r1 = insertelement <4 x i32> %x, i32 %val, i32 1 615 %r2 = insertelement <4 x i32> %r1, i32 %y, i32 3 616 ret <4 x i32> %r2 617} 618 619define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) { 620; CHECK-LABEL: insert_v32i16: 621; CHECK: ## %bb.0: 622; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 623; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 624; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 625; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 626; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 627; CHECK-NEXT: retq 628 %val = load i16, i16* %ptr 629 %r1 = insertelement <32 x i16> %x, i16 %val, i32 1 630 %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9 631 ret <32 x i16> %r2 632} 633 634define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) { 635; CHECK-LABEL: insert_v16i16: 636; CHECK: ## %bb.0: 637; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 638; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 639; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 640; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 641; CHECK-NEXT: retq 642 %val = load i16, i16* %ptr 643 %r1 = insertelement <16 x i16> %x, i16 %val, i32 1 644 %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9 645 ret <16 x i16> %r2 646} 647 648define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) { 649; CHECK-LABEL: insert_v8i16: 650; CHECK: ## %bb.0: 651; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm0 652; CHECK-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 653; CHECK-NEXT: retq 654 %val = load i16, i16* %ptr 655 %r1 = insertelement <8 x i16> %x, i16 %val, i32 1 656 %r2 = insertelement 
<8 x i16> %r1, i16 %y, i32 5 657 ret <8 x i16> %r2 658} 659 660define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) { 661; CHECK-LABEL: insert_v64i8: 662; CHECK: ## %bb.0: 663; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 664; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1 665; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0 666; CHECK-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 667; CHECK-NEXT: vinserti32x4 $3, %xmm0, %zmm1, %zmm0 668; CHECK-NEXT: retq 669 %val = load i8, i8* %ptr 670 %r1 = insertelement <64 x i8> %x, i8 %val, i32 1 671 %r2 = insertelement <64 x i8> %r1, i8 %y, i32 50 672 ret <64 x i8> %r2 673} 674 675define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) { 676; CHECK-LABEL: insert_v32i8: 677; CHECK: ## %bb.0: 678; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1 679; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 680; CHECK-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 681; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 682; CHECK-NEXT: retq 683 %val = load i8, i8* %ptr 684 %r1 = insertelement <32 x i8> %x, i8 %val, i32 1 685 %r2 = insertelement <32 x i8> %r1, i8 %y, i32 17 686 ret <32 x i8> %r2 687} 688 689define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) { 690; CHECK-LABEL: insert_v16i8: 691; CHECK: ## %bb.0: 692; CHECK-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 693; CHECK-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0 694; CHECK-NEXT: retq 695 %val = load i8, i8* %ptr 696 %r1 = insertelement <16 x i8> %x, i8 %val, i32 3 697 %r2 = insertelement <16 x i8> %r1, i8 %y, i32 10 698 ret <16 x i8> %r2 699} 700 701define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) { 702; CHECK-LABEL: test_insert_128_v8i64: 703; CHECK: ## %bb.0: 704; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 705; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 706; CHECK-NEXT: retq 707 %r = insertelement <8 x i64> %x, i64 %y, i32 1 708 ret <8 x i64> %r 709} 710 711define <16 x i32> @test_insert_128_v16i32(<16 x i32> %x, i32 %y) { 712; CHECK-LABEL: test_insert_128_v16i32: 713; CHECK: ## 
%bb.0: 714; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm1 715; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 716; CHECK-NEXT: retq 717 %r = insertelement <16 x i32> %x, i32 %y, i32 1 718 ret <16 x i32> %r 719} 720 721define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) { 722; CHECK-LABEL: test_insert_128_v8f64: 723; CHECK: ## %bb.0: 724; CHECK-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0] 725; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 726; CHECK-NEXT: retq 727 %r = insertelement <8 x double> %x, double %y, i32 1 728 ret <8 x double> %r 729} 730 731define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) { 732; CHECK-LABEL: test_insert_128_v16f32: 733; CHECK: ## %bb.0: 734; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3] 735; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 736; CHECK-NEXT: retq 737 %r = insertelement <16 x float> %x, float %y, i32 1 738 ret <16 x float> %r 739} 740 741define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) { 742; CHECK-LABEL: test_insert_128_v16i16: 743; CHECK: ## %bb.0: 744; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 745; CHECK-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1 746; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 747; CHECK-NEXT: retq 748 %r = insertelement <16 x i16> %x, i16 %y, i32 10 749 ret <16 x i16> %r 750} 751 752define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) { 753; CHECK-LABEL: test_insert_128_v32i8: 754; CHECK: ## %bb.0: 755; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 756; CHECK-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 757; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 758; CHECK-NEXT: retq 759 %r = insertelement <32 x i8> %x, i8 %y, i32 20 760 ret <32 x i8> %r 761} 762 763define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) { 764; KNL-LABEL: test_insertelement_v32i1: 765; KNL: ## %bb.0: 766; KNL-NEXT: cmpl %esi, %edi 767; KNL-NEXT: setb %al 768; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0 769; KNL-NEXT: kmovw %k0, %ecx 
770; KNL-NEXT: shll $16, %ecx 771; KNL-NEXT: movw $-17, %dx 772; KNL-NEXT: kmovw %edx, %k1 773; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0 {%k1} 774; KNL-NEXT: kmovw %eax, %k1 775; KNL-NEXT: kshiftlw $15, %k1, %k1 776; KNL-NEXT: kshiftrw $11, %k1, %k1 777; KNL-NEXT: korw %k1, %k0, %k0 778; KNL-NEXT: kmovw %k0, %eax 779; KNL-NEXT: orl %ecx, %eax 780; KNL-NEXT: vzeroupper 781; KNL-NEXT: retq 782; 783; SKX-LABEL: test_insertelement_v32i1: 784; SKX: ## %bb.0: 785; SKX-NEXT: cmpl %esi, %edi 786; SKX-NEXT: setb %al 787; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0 788; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1 789; SKX-NEXT: kunpckwd %k0, %k1, %k0 790; SKX-NEXT: movl $-17, %ecx 791; SKX-NEXT: kmovd %ecx, %k1 792; SKX-NEXT: kandd %k1, %k0, %k0 793; SKX-NEXT: kmovd %eax, %k1 794; SKX-NEXT: kshiftld $31, %k1, %k1 795; SKX-NEXT: kshiftrd $27, %k1, %k1 796; SKX-NEXT: kord %k1, %k0, %k0 797; SKX-NEXT: kmovd %k0, %eax 798; SKX-NEXT: vzeroupper 799; SKX-NEXT: retq 800 %cmp_res_i1 = icmp ult i32 %a, %b 801 %cmp_cmp_vec = icmp ult <32 x i32> %x, %y 802 %maskv = insertelement <32 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 4 803 %res = bitcast <32 x i1> %maskv to i32 804 ret i32 %res 805} 806 807define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) { 808; KNL-LABEL: test_iinsertelement_v4i1: 809; KNL: ## %bb.0: 810; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 811; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 812; KNL-NEXT: cmpl %esi, %edi 813; KNL-NEXT: setb %al 814; KNL-NEXT: movw $-5, %cx 815; KNL-NEXT: kmovw %ecx, %k1 816; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} 817; KNL-NEXT: kmovw %eax, %k1 818; KNL-NEXT: kshiftlw $15, %k1, %k1 819; KNL-NEXT: kshiftrw $13, %k1, %k1 820; KNL-NEXT: korw %k1, %k0, %k0 821; KNL-NEXT: kmovw %k0, %eax 822; KNL-NEXT: ## kill: def $al killed $al killed $eax 823; KNL-NEXT: vzeroupper 824; KNL-NEXT: retq 825; 826; SKX-LABEL: test_iinsertelement_v4i1: 827; SKX: ## %bb.0: 828; SKX-NEXT: cmpl %esi, %edi 829; SKX-NEXT: setb %al 
; SKX-NEXT:    movb $-5, %cl
; SKX-NEXT:    kmovd %ecx, %k1
; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k0 {%k1}
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    kshiftlb $7, %k1, %k1
; SKX-NEXT:    kshiftrb $5, %k1, %k1
; SKX-NEXT:    korw %k1, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    ## kill: def $al killed $al killed $eax
; SKX-NEXT:    retq
  %cmp_res_i1 = icmp ult i32 %a, %b
  %cmp_cmp_vec = icmp ult <4 x i32> %x, %y
  %maskv = insertelement <4 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 2
  %res0 = shufflevector <4 x i1> %maskv, <4 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  %res = bitcast <8 x i1> %res0 to i8
  ret i8 %res
}

; Inserts the scalar i1 result of `icmp ult i32 %a, %b` at constant index 1 of
; the <2 x i1> result of `icmp ult <2 x i64> %x, %y`, widens the mask to
; <8 x i1> with a shufflevector and returns it bitcast to i8.
; NOTE(review): "iinsertelement" in the name looks like a typo for
; "insertelement"; left unchanged because the CHECK-LABEL lines depend on it.
define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) {
; KNL-LABEL: test_iinsertelement_v2i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    cmpl %esi, %edi
; KNL-NEXT:    setb %al
; KNL-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
; KNL-NEXT:    kshiftlw $15, %k0, %k0
; KNL-NEXT:    kshiftrw $15, %k0, %k0
; KNL-NEXT:    kmovw %eax, %k1
; KNL-NEXT:    kshiftlw $1, %k1, %k1
; KNL-NEXT:    korw %k1, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    ## kill: def $al killed $al killed $eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_iinsertelement_v2i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    cmpl %esi, %edi
; SKX-NEXT:    setb %al
; SKX-NEXT:    vpcmpltuq %xmm1, %xmm0, %k0
; SKX-NEXT:    kshiftlb $7, %k0, %k0
; SKX-NEXT:    kshiftrb $7, %k0, %k0
; SKX-NEXT:    kmovd %eax, %k1
; SKX-NEXT:    kshiftlb $1, %k1, %k1
; SKX-NEXT:    korw %k1, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    ## kill: def $al killed $al killed $eax
; SKX-NEXT:    retq
  %cmp_res_i1 = icmp ult i32 %a, %b
  %cmp_cmp_vec = icmp ult <2 x i64> %x, %y
  %maskv = insertelement <2 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 1
  %res0 = shufflevector <2 x i1> %maskv, <2 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %res = bitcast <8 x i1> %res0 to i8
  ret i8 %res
}

; Extracts element 0 of `icmp ugt <2 x i64> %a, %b` and uses it to select
; between i8 3 (bit set) and i8 4 (bit clear).
define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
; KNL-LABEL: test_extractelement_v2i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %ecx
; KNL-NEXT:    andl $1, %ecx
; KNL-NEXT:    movl $4, %eax
; KNL-NEXT:    subl %ecx, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_extractelement_v2i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT:    kmovd %k0, %ecx
; SKX-NEXT:    andl $1, %ecx
; SKX-NEXT:    movl $4, %eax
; SKX-NEXT:    subl %ecx, %eax
; SKX-NEXT:    retq
  %t1 = icmp ugt <2 x i64> %a, %b
  %t2 = extractelement <2 x i1> %t1, i32 0
  %res = select i1 %t2, i8 3, i8 4
  ret i8 %res
}

; Same extract as test_extractelement_v2i1, but the result is computed
; arithmetically as `sext(i1) + 4` instead of with a select.
define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
; KNL-LABEL: extractelement_v2i1_alt:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    andb $1, %al
; KNL-NEXT:    movb $4, %cl
; KNL-NEXT:    subb %al, %cl
; KNL-NEXT:    movzbl %cl, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: extractelement_v2i1_alt:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    andb $1, %al
; SKX-NEXT:    movb $4, %cl
; SKX-NEXT:    subb %al, %cl
; SKX-NEXT:    movzbl %cl, %eax
; SKX-NEXT:    retq
  %t1 = icmp ugt <2 x i64> %a, %b
  %t2 = extractelement <2 x i1> %t1, i32 0
  %sext = sext i1 %t2 to i8
  %res = add i8 %sext, 4
  ret i8 %res
}

; Extracts element 3 (the last one) of `icmp ugt <4 x i32> %a, %b` and
; zero-extends it to i8.
define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
; KNL-LABEL: test_extractelement_v4i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
; KNL-NEXT:    kshiftrw $3, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    andl $1, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_extractelement_v4i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0
; SKX-NEXT:    kshiftrb $3, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    andl $1, %eax
; SKX-NEXT:    retq
  %t1 = icmp ugt <4 x i32> %a, %b
  %t2 = extractelement <4 x i1> %t1, i32 3
  %res = zext i1 %t2 to i8
  ret i8 %res
}

; Extracts element 2 of `icmp ugt <32 x i8> %a, %b` and zero-extends it to i8.
define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
; KNL-LABEL: test_extractelement_v32i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    vpminub %xmm1, %xmm0, %xmm1
; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kshiftrw $2, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    andl $1, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_extractelement_v32i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0
; SKX-NEXT:    kshiftrd $2, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    andl $1, %eax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %t1 = icmp ugt <32 x i8> %a, %b
  %t2 = extractelement <32 x i1> %t1, i32 2
  %res = zext i1 %t2 to i8
  ret i8 %res
}

; Extracts element 63 (the last one) of `icmp ugt <64 x i8> %a, %b` and uses
; it to select between i8 3 (bit set) and i8 4 (bit clear).
define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: test_extractelement_v64i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT:    vpminub %xmm1, %xmm0, %xmm1
; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kshiftrw $15, %k0, %k0
; KNL-NEXT:    kmovw %k0, %ecx
; KNL-NEXT:    andl $1, %ecx
; KNL-NEXT:    movl $4, %eax
; KNL-NEXT:    subl %ecx, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_extractelement_v64i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT:    kshiftrq $63, %k0, %k0
; SKX-NEXT:    kmovd %k0, %ecx
; SKX-NEXT:    andl $1, %ecx
; SKX-NEXT:    movl $4, %eax
; SKX-NEXT:    subl %ecx, %eax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %t1 = icmp ugt <64 x i8> %a, %b
  %t2 = extractelement <64 x i1> %t1, i32 63
  %res = select i1 %t2, i8 3, i8 4
  ret i8 %res
}

; Same extract as test_extractelement_v64i1, but the result is computed
; arithmetically as `sext(i1) + 4` instead of with a select.
define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: extractelement_v64i1_alt:
; KNL:       ## %bb.0:
; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT:    vpminub %xmm1, %xmm0, %xmm1
; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kshiftrw $15, %k0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    andb $1, %al
; KNL-NEXT:    movb $4, %cl
; KNL-NEXT:    subb %al, %cl
; KNL-NEXT:    movzbl %cl, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: extractelement_v64i1_alt:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT:    kshiftrq $63, %k0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    andb $1, %al
; SKX-NEXT:    movb $4, %cl
; SKX-NEXT:    subb %al, %cl
; SKX-NEXT:    movzbl %cl, %eax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %t1 = icmp ugt <64 x i8> %a, %b
  %t2 = extractelement <64 x i1> %t1, i32 63
  %sext = sext i1 %t2 to i8
  %res = add i8 %sext, 4
  ret i8 %res
}

; Extracts an i64 from <2 x i64> at a run-time (non-constant) i32 index.
define i64 @test_extractelement_variable_v2i64(<2 x i64> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v2i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    movq -24(%rsp,%rdi,8), %rax
; CHECK-NEXT:    retq
  %t2 = extractelement <2 x i64> %t1, i32 %index
  ret i64 %t2
}

; Extracts an i64 from <4 x i64> at a run-time i32 index.
define i64 @test_extractelement_variable_v4i64(<4 x i64> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-32, %rsp
; CHECK-NEXT:    subq $64, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
; CHECK-NEXT:    andl $3, %edi
; CHECK-NEXT:    movq (%rsp,%rdi,8), %rax
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <4 x i64> %t1, i32 %index
  ret i64 %t2
}

; Extracts an i64 from <8 x i64> at a run-time i32 index.
define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8i64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    andl $7, %edi
; CHECK-NEXT:    movq (%rsp,%rdi,8), %rax
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <8 x i64> %t1, i32 %index
  ret i64 %t2
}

; Extracts a double from <2 x double> at a run-time i32 index.
define double @test_extractelement_variable_v2f64(<2 x double> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v2f64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    andl $1, %edi
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    retq
  %t2 = extractelement <2 x double> %t1, i32 %index
  ret double %t2
}

; Extracts a double from <4 x double> at a run-time i32 index.
define double @test_extractelement_variable_v4f64(<4 x double> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4f64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-32, %rsp
; CHECK-NEXT:    subq $64, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
; CHECK-NEXT:    andl $3, %edi
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <4 x double> %t1, i32 %index
  ret double %t2
}

; Extracts a double from <8 x double> at a run-time i32 index.
define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8f64:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    andl $7, %edi
; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <8 x double> %t1, i32 %index
  ret double %t2
}

; Extracts an i32 from <4 x i32> at a run-time i32 index.
define i32 @test_extractelement_variable_v4i32(<4 x i32> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    andl $3, %edi
; CHECK-NEXT:    movl -24(%rsp,%rdi,4), %eax
; CHECK-NEXT:    retq
  %t2 = extractelement <4 x i32> %t1, i32 %index
  ret i32 %t2
}

; Extracts an i32 from <8 x i32> at a run-time i32 index.
define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-32, %rsp
; CHECK-NEXT:    subq $64, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
; CHECK-NEXT:    andl $7, %edi
; CHECK-NEXT:    movl (%rsp,%rdi,4), %eax
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <8 x i32> %t1, i32 %index
  ret i32 %t2
}

; Extracts an i32 from <16 x i32> at a run-time i32 index.
define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16i32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    andl $15, %edi
; CHECK-NEXT:    movl (%rsp,%rdi,4), %eax
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <16 x i32> %t1, i32 %index
  ret i32 %t2
}

; Extracts a float from <4 x float> at a run-time i32 index.
define float @test_extractelement_variable_v4f32(<4 x float> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v4f32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    andl $3, %edi
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    retq
  %t2 = extractelement <4 x float> %t1, i32 %index
  ret float %t2
}

; Extracts a float from <8 x float> at a run-time i32 index.
define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8f32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-32, %rsp
; CHECK-NEXT:    subq $64, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
; CHECK-NEXT:    andl $7, %edi
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <8 x float> %t1, i32 %index
  ret float %t2
}

; Extracts a float from <16 x float> at a run-time i32 index.
define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16f32:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    andl $15, %edi
; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <16 x float> %t1, i32 %index
  ret float %t2
}

; Extracts an i16 from <8 x i16> at a run-time i32 index.
define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v8i16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    andl $7, %edi
; CHECK-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
; CHECK-NEXT:    retq
  %t2 = extractelement <8 x i16> %t1, i32 %index
  ret i16 %t2
}

; Extracts an i16 from <16 x i16> at a run-time i32 index.
define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16i16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-32, %rsp
; CHECK-NEXT:    subq $64, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
; CHECK-NEXT:    andl $15, %edi
; CHECK-NEXT:    movzwl (%rsp,%rdi,2), %eax
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <16 x i16> %t1, i32 %index
  ret i16 %t2
}

; Extracts an i16 from <32 x i16> at a run-time i32 index.
define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v32i16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    andl $31, %edi
; CHECK-NEXT:    movzwl (%rsp,%rdi,2), %eax
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %t2 = extractelement <32 x i16> %t1, i32 %index
  ret i16 %t2
}

; Extracts an i8 from <16 x i8> at a run-time i32 index.
define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v16i8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    andl $15, %edi
; CHECK-NEXT:    movb -24(%rsp,%rdi), %al
; CHECK-NEXT:    retq
  %t2 = extractelement <16 x i8> %t1, i32 %index
  ret i8 %t2
}

; Extracts an i8 from <32 x i8> at a run-time i32 index.
define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v32i8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-32, %rsp
; CHECK-NEXT:    subq $64, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
; CHECK-NEXT:    andl $31, %edi
; CHECK-NEXT:    movb (%rsp,%rdi), %al
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq

  %t2 = extractelement <32 x i8> %t1, i32 %index
  ret i8 %t2
}

; Extracts an i8 from <64 x i8> at a run-time i32 index.
define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; CHECK-LABEL: test_extractelement_variable_v64i8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    andl $63, %edi
; CHECK-NEXT:    movb (%rsp,%rdi), %al
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq

  %t2 = extractelement <64 x i8> %t1, i32 %index
  ret i8 %t2
}

; Extracts an i8 from <64 x i8> using an i8-typed run-time index that is
; itself computed in the function as `%index + %index`.
define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
; CHECK-LABEL: test_extractelement_variable_v64i8_indexi8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsp, %rbp
; CHECK-NEXT:    .cfi_def_cfa_register %rbp
; CHECK-NEXT:    andq $-64, %rsp
; CHECK-NEXT:    subq $128, %rsp
; CHECK-NEXT:    addb %dil, %dil
; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
; CHECK-NEXT:    movzbl %dil, %eax
; CHECK-NEXT:    andl $63, %eax
; CHECK-NEXT:    movb (%rsp,%rax), %al
; CHECK-NEXT:    movq %rbp, %rsp
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq

  %i = add i8 %index, %index
  %t2 = extractelement <64 x i8> %t1, i8 %i
  ret i8 %t2
}

; Extracts one bit of `icmp ugt <2 x i64> %a, %b` at a run-time i32 index and
; zero-extends it to i8.
; NOTE(review): "varible" in the name looks like a typo for "variable"; left
; unchanged because the CHECK-LABEL lines depend on it.
define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v2i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT:    andl $1, %edi
; KNL-NEXT:    movzbl -24(%rsp,%rdi,8), %eax
; KNL-NEXT:    andl $1, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_extractelement_varible_v2i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT:    vpmovm2q %k0, %xmm0
; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    andl $1, %edi
; SKX-NEXT:    movzbl -24(%rsp,%rdi,8), %eax
; SKX-NEXT:    andl $1, %eax
; SKX-NEXT:    retq
  %t1 = icmp ugt <2 x i64> %a, %b
  %t2 = extractelement <2 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

; Extracts one bit of `icmp ugt <4 x i32> %a, %b` at a run-time i32 index and
; zero-extends it to i8.
define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v4i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT:    andl $3, %edi
; KNL-NEXT:    movzbl -24(%rsp,%rdi,4), %eax
; KNL-NEXT:    andl $1, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_extractelement_varible_v4i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
; SKX-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0
; SKX-NEXT:    vpmovm2d %k0, %xmm0
; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    andl $3, %edi
; SKX-NEXT:    movzbl -24(%rsp,%rdi,4), %eax
; SKX-NEXT:    andl $1, %eax
; SKX-NEXT:    retq
  %t1 = icmp ugt <4 x i32> %a, %b
  %t2 = extractelement <4 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

; Extracts one bit of `icmp ugt <8 x i32> %a, %b` at a run-time i32 index and
; zero-extends it to i8.
define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v8i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
; KNL-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT:    vpmovdw %zmm0, %ymm0
; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT:    andl $7, %edi
; KNL-NEXT:    movzbl -24(%rsp,%rdi,2), %eax
; KNL-NEXT:    andl $1, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_extractelement_varible_v8i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
; SKX-NEXT:    vpcmpnleud %ymm1, %ymm0, %k0
; SKX-NEXT:    vpmovm2w %k0, %xmm0
; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    andl $7, %edi
; SKX-NEXT:    movzbl -24(%rsp,%rdi,2), %eax
; SKX-NEXT:    andl $1, %eax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %t1 = icmp ugt <8 x i32> %a, %b
  %t2 = extractelement <8 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

; Extracts one bit of `icmp ugt <16 x i32> %a, %b` at a run-time i32 index and
; zero-extends it to i8.
define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v16i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT:    vpmovdb %zmm0, -{{[0-9]+}}(%rsp)
; KNL-NEXT:    andl $15, %edi
; KNL-NEXT:    movzbl -24(%rsp,%rdi), %eax
; KNL-NEXT:    andl $1, %eax
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_extractelement_varible_v16i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
; SKX-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
; SKX-NEXT:    vpmovm2b %k0, %xmm0
; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; SKX-NEXT:    andl $15, %edi
; SKX-NEXT:    movzbl -24(%rsp,%rdi), %eax
; SKX-NEXT:    andl $1, %eax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %t1 = icmp ugt <16 x i32> %a, %b
  %t2 = extractelement <16 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

; Extracts one bit of `icmp ugt <32 x i8> %a, %b` at a run-time i32 index and
; zero-extends it to i8.
define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v32i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    pushq %rbp
; KNL-NEXT:    .cfi_def_cfa_offset 16
; KNL-NEXT:    .cfi_offset %rbp, -16
; KNL-NEXT:    movq %rsp, %rbp
; KNL-NEXT:    .cfi_def_cfa_register %rbp
; KNL-NEXT:    andq $-32, %rsp
; KNL-NEXT:    subq $64, %rsp
; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
; KNL-NEXT:    vpminub %ymm1, %ymm0, %ymm1
; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
; KNL-NEXT:    andl $31, %edi
; KNL-NEXT:    movzbl (%rsp,%rdi), %eax
; KNL-NEXT:    andl $1, %eax
; KNL-NEXT:    movq %rbp, %rsp
; KNL-NEXT:    popq %rbp
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_extractelement_varible_v32i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    pushq %rbp
; SKX-NEXT:    .cfi_def_cfa_offset 16
; SKX-NEXT:    .cfi_offset %rbp, -16
; SKX-NEXT:    movq %rsp, %rbp
; SKX-NEXT:    .cfi_def_cfa_register %rbp
; SKX-NEXT:    andq $-32, %rsp
; SKX-NEXT:    subq $64, %rsp
; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0
; SKX-NEXT:    vpmovm2b %k0, %ymm0
; SKX-NEXT:    vmovdqa %ymm0, (%rsp)
; SKX-NEXT:    andl $31, %edi
; SKX-NEXT:    movzbl (%rsp,%rdi), %eax
; SKX-NEXT:    andl $1, %eax
; SKX-NEXT:    movq %rbp, %rsp
; SKX-NEXT:    popq %rbp
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %t1 = icmp ugt <32 x i8> %a, %b
  %t2 = extractelement <32 x i1> %t1, i32 %index
  %res = zext i1 %t2 to i8
  ret i8 %res
}

; Builds an <8 x i64> through a chain of three shufflevectors whose net
; result is <0, 0, 0, 0, a0, a1, 0, 0>: %a lands in elements 4-5 and every
; other element is zero.
define <8 x i64> @insert_double_zero(<2 x i64> %a) nounwind {
; CHECK-LABEL: insert_double_zero:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vinsertf32x4 $2, %xmm0, %zmm1, %zmm0
; CHECK-NEXT:    retq
  %b = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %d = shufflevector <4 x i64> %b, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %e
}

; Inserts the i1 result of `icmp ugt i8 %b, 0` into the <32 x i1> result of
; `icmp ugt <32 x i8> %a, zeroinitializer` at a run-time i32 index, then
; returns the mask bitcast to i32.
define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v32i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    pushq %rbp
; KNL-NEXT:    .cfi_def_cfa_offset 16
; KNL-NEXT:    .cfi_offset %rbp, -16
; KNL-NEXT:    movq %rsp, %rbp
; KNL-NEXT:    .cfi_def_cfa_register %rbp
; KNL-NEXT:    andq $-32, %rsp
; KNL-NEXT:    subq $64, %rsp
; KNL-NEXT:    ## kill: def $esi killed $esi def $rsi
; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT:    andl $31, %esi
; KNL-NEXT:    testb %dil, %dil
; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
; KNL-NEXT:    setne (%rsp,%rsi)
; KNL-NEXT:    vpmovsxbd (%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %ecx
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    shll $16, %eax
; KNL-NEXT:    orl %ecx, %eax
; KNL-NEXT:    movq %rbp, %rsp
; KNL-NEXT:    popq %rbp
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_insertelement_variable_v32i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestmb %ymm0, %ymm0, %k0
; SKX-NEXT:    testb %dil, %dil
; SKX-NEXT:    setne %al
; SKX-NEXT:    vpbroadcastb %esi, %ymm0
; SKX-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k1
; SKX-NEXT:    vpmovm2b %k0, %ymm0
; SKX-NEXT:    vpbroadcastb %eax, %ymm0 {%k1}
; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0
; SKX-NEXT:    vpmovb2m %ymm0, %k0
; SKX-NEXT:    kmovd %k0, %eax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %t1 = icmp ugt <32 x i8> %a, zeroinitializer
  %t2 = icmp ugt i8 %b, 0
  %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
  %t4 = bitcast <32 x i1> %t3 to i32
  ret i32 %t4
}

; Inserts the i1 result of `icmp ugt i8 %b, 0` into the <64 x i1> result of
; `icmp ugt <64 x i8> %a, zeroinitializer` at a run-time i32 index, then
; returns the mask bitcast to i64.
define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v64i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    pushq %rbp
; KNL-NEXT:    .cfi_def_cfa_offset 16
; KNL-NEXT:    .cfi_offset %rbp, -16
; KNL-NEXT:    movq %rsp, %rbp
; KNL-NEXT:    .cfi_def_cfa_register %rbp
; KNL-NEXT:    andq $-64, %rsp
; KNL-NEXT:    subq $128, %rsp
; KNL-NEXT:    ## kill: def $esi killed $esi def $rsi
; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
; KNL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT:    andl $63, %esi
; KNL-NEXT:    testb %dil, %dil
; KNL-NEXT:    vmovdqa64 %zmm0, (%rsp)
; KNL-NEXT:    setne (%rsp,%rsi)
; KNL-NEXT:    vpmovsxbd (%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %ecx
; KNL-NEXT:    shll $16, %ecx
; KNL-NEXT:    orl %eax, %ecx
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %edx
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    shll $16, %eax
; KNL-NEXT:    orl %edx, %eax
; KNL-NEXT:    shlq $32, %rax
; KNL-NEXT:    orq %rcx, %rax
; KNL-NEXT:    movq %rbp, %rsp
; KNL-NEXT:    popq %rbp
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_insertelement_variable_v64i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT:    testb %dil, %dil
; SKX-NEXT:    setne %al
; SKX-NEXT:    vpbroadcastb %esi, %zmm0
; SKX-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k1
; SKX-NEXT:    vpmovm2b %k0, %zmm0
; SKX-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
; SKX-NEXT:    vpsllw $7, %zmm0, %zmm0
; SKX-NEXT:    vpmovb2m %zmm0, %k0
; SKX-NEXT:    kmovq %k0, %rax
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  %t1 = icmp ugt <64 x i8> %a, zeroinitializer
  %t2 = icmp ugt i8 %b, 0
  %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
  %t4 = bitcast <64 x i1> %t3 to i64
  ret i64 %t4
}

define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v96i1:
; KNL:       ## %bb.0:
; KNL-NEXT:    pushq %rbp
; KNL-NEXT:    .cfi_def_cfa_offset 16
; KNL-NEXT:    .cfi_offset %rbp, -16
; KNL-NEXT:    movq %rsp, %rbp
; KNL-NEXT:    .cfi_def_cfa_register %rbp
; KNL-NEXT:    andq $-64, %rsp
; KNL-NEXT:    subq $192, %rsp
; KNL-NEXT:    movl 744(%rbp), %eax
; KNL-NEXT:    andl $127, %eax
; KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT:    vpinsrb $1, 232(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $2, 240(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $3, 248(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $4, 256(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $5, 264(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $6, 272(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $7, 280(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $8, 288(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $9, 296(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $10, 304(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $11, 312(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $12, 320(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $13, 328(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $14, 336(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vpinsrb $15, 344(%rbp), %xmm0, %xmm0
; KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; KNL-NEXT:    vpinsrb $1, 360(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $2, 368(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $3, 376(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $4, 384(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $5, 392(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $6, 400(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $7, 408(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $8, 416(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $9, 424(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $10, 432(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $11, 440(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $12, 448(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $13, 456(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $14, 464(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vpinsrb $15, 472(%rbp), %xmm1, %xmm1
; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm1
; KNL-NEXT:    vmovd %edi, %xmm2
; KNL-NEXT:    vpinsrb $1, %esi, %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $2, %edx, %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $4, %r8d, %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $5, %r9d, %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $6, 16(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $7, 24(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $8, 32(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $9, 40(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $10, 48(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $11, 56(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $12, 64(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $13, 72(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $14, 80(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $15, 88(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; KNL-NEXT:    vpinsrb $1, 104(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $2, 112(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $3, 120(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $4, 128(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $5, 136(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $6, 144(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $7, 152(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $8, 160(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $9, 168(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $10, 176(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $11, 184(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $12, 192(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $13, 200(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $14, 208(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $15, 216(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; KNL-NEXT:    vpcmpeqb %ymm0, %ymm2, %ymm2
; KNL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; KNL-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; KNL-NEXT:    vpinsrb $1, 488(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $2, 496(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $3, 504(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $4, 512(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $5, 520(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $6, 528(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $7, 536(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $8, 544(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $9, 552(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $10, 560(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $11, 568(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $12, 576(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $13, 584(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $14, 592(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vpinsrb $15, 600(%rbp), %xmm2, %xmm2
; KNL-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; KNL-NEXT:    vpinsrb $1, 616(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $2, 624(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $3, 632(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $4, 640(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $5, 648(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $6, 656(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $7, 664(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $8, 672(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $9, 680(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $10, 688(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $11, 696(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $12, 704(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $13, 712(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $14, 720(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vpinsrb $15, 728(%rbp), %xmm3, %xmm3
; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; KNL-NEXT:    vpcmpeqb %ymm0, %ymm2, %ymm0
; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT:    cmpb $0, 736(%rbp)
; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; KNL-NEXT:    vmovdqa64 %zmm1, (%rsp)
; KNL-NEXT:    setne (%rsp,%rax)
; KNL-NEXT:    vpmovsxbd (%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %ecx
; KNL-NEXT:    shll $16, %ecx
; KNL-NEXT:    orl %eax, %ecx
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %edx
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %eax
; KNL-NEXT:    shll $16, %eax
; KNL-NEXT:    orl %edx, %eax
; KNL-NEXT:    shlq $32, %rax
; KNL-NEXT:    orq %rcx, %rax
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %ecx
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %esi
; KNL-NEXT:    shll $16, %esi
; KNL-NEXT:    orl %ecx, %esi
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %ecx
; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT:    kmovw %k0, %edx
; KNL-NEXT:    shll $16, %edx
; KNL-NEXT:    orl %ecx, %edx
; KNL-NEXT:    shlq $32, %rdx
; KNL-NEXT:    orq %rsi, %rdx
; KNL-NEXT:    movq %rbp, %rsp
; KNL-NEXT:    popq %rbp
; KNL-NEXT:    vzeroupper
; KNL-NEXT:    retq
;
; SKX-LABEL: test_insertelement_variable_v96i1:
; SKX:       ## %bb.0:
; SKX-NEXT:    pushq %rbp
; SKX-NEXT:    .cfi_def_cfa_offset 16
; SKX-NEXT:    .cfi_offset %rbp, -16
; SKX-NEXT:    movq %rsp, %rbp
; SKX-NEXT:    .cfi_def_cfa_register %rbp
; SKX-NEXT:    andq $-64, %rsp
; SKX-NEXT:    subq $192, %rsp
; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SKX-NEXT:    vpinsrb $1, 232(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $2, 240(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $3, 248(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $4, 256(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $5, 264(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $6, 272(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $7, 280(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $8, 288(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $9, 296(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $10, 304(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $11, 312(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $12, 320(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $13, 328(%rbp), %xmm0, %xmm0
; SKX-NEXT:    vpinsrb $14, 336(%rbp), %xmm0, %xmm0
;
SKX-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0 1940; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1941; SKX-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1 1942; SKX-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1 1943; SKX-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1 1944; SKX-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1 1945; SKX-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1 1946; SKX-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1 1947; SKX-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1 1948; SKX-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1 1949; SKX-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1 1950; SKX-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1 1951; SKX-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1 1952; SKX-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1 1953; SKX-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1 1954; SKX-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1 1955; SKX-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1 1956; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1957; SKX-NEXT: vmovd %edi, %xmm1 1958; SKX-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1 1959; SKX-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 1960; SKX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 1961; SKX-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1 1962; SKX-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1 1963; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm1, %xmm1 1964; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm1, %xmm1 1965; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm1, %xmm1 1966; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm1, %xmm1 1967; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm1, %xmm1 1968; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm1, %xmm1 1969; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm1, %xmm1 1970; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm1, %xmm1 1971; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm1, %xmm1 1972; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm1, %xmm1 1973; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1974; SKX-NEXT: vpinsrb $1, 104(%rbp), %xmm2, %xmm2 1975; SKX-NEXT: vpinsrb $2, 112(%rbp), %xmm2, %xmm2 1976; SKX-NEXT: vpinsrb $3, 120(%rbp), %xmm2, %xmm2 1977; SKX-NEXT: vpinsrb $4, 128(%rbp), %xmm2, %xmm2 
1978; SKX-NEXT: vpinsrb $5, 136(%rbp), %xmm2, %xmm2 1979; SKX-NEXT: vpinsrb $6, 144(%rbp), %xmm2, %xmm2 1980; SKX-NEXT: vpinsrb $7, 152(%rbp), %xmm2, %xmm2 1981; SKX-NEXT: vpinsrb $8, 160(%rbp), %xmm2, %xmm2 1982; SKX-NEXT: vpinsrb $9, 168(%rbp), %xmm2, %xmm2 1983; SKX-NEXT: vpinsrb $10, 176(%rbp), %xmm2, %xmm2 1984; SKX-NEXT: vpinsrb $11, 184(%rbp), %xmm2, %xmm2 1985; SKX-NEXT: vpinsrb $12, 192(%rbp), %xmm2, %xmm2 1986; SKX-NEXT: vpinsrb $13, 200(%rbp), %xmm2, %xmm2 1987; SKX-NEXT: vpinsrb $14, 208(%rbp), %xmm2, %xmm2 1988; SKX-NEXT: vpinsrb $15, 216(%rbp), %xmm2, %xmm2 1989; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 1990; SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 1991; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1992; SKX-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1 1993; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1 1994; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1 1995; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1 1996; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1 1997; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1 1998; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1 1999; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1 2000; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1 2001; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1 2002; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1 2003; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1 2004; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1 2005; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1 2006; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1 2007; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 2008; SKX-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2 2009; SKX-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2 2010; SKX-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2 2011; SKX-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2 2012; SKX-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2 2013; SKX-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2 2014; SKX-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2 2015; SKX-NEXT: vpinsrb $8, 672(%rbp), %xmm2, 
%xmm2
; SKX-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2
; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; SKX-NEXT: movl 744(%rbp), %eax
; SKX-NEXT: andl $127, %eax
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
; SKX-NEXT: cmpb $0, 736(%rbp)
; SKX-NEXT: vpmovm2b %k1, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rax)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k1
; SKX-NEXT: kmovq %k1, %rax
; SKX-NEXT: kmovq %k0, %rdx
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %t1 = icmp ugt <96 x i8> %a, zeroinitializer
  %t2 = icmp ugt i8 %b, 0
  %t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index
  %t4 = bitcast <96 x i1> %t3 to i96
  ret i96 %t4
}

; Variable-index insertelement into a <128 x i1> mask.
; The mask is the per-byte compare of %a against zero and the inserted bit is
; the compare of %b against zero; both use "icmp ugt ..., 0", which is true
; exactly when the byte is nonzero. The updated mask is returned as an i128.
; The expected code for both the KNL and SKX runs stores the mask to the stack
; as bytes, overwrites one byte with setne at the index masked by "andl $127",
; and then rebuilds the mask bits from the stored bytes.
define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) {
; KNL-LABEL: test_insertelement_variable_v128i1:
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $192, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; KNL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: andl $127, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovdqa64 %zmm0, (%rsp)
; KNL-NEXT: setne (%rsp,%rsi)
; KNL-NEXT: vpmovsxbd (%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: shll $16, %esi
; KNL-NEXT: orl %ecx, %esi
; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: shll $16, %edx
; KNL-NEXT: orl %ecx, %edx
; KNL-NEXT: shlq $32, %rdx
; KNL-NEXT: orq %rsi, %rdx
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_variable_v128i1:
; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $192, %rsp
; SKX-NEXT: ## kill: def $esi killed $esi def $rsi
; SKX-NEXT: vptestmb %zmm0, %zmm0, %k0
; SKX-NEXT: vptestmb %zmm1, %zmm1, %k1
; SKX-NEXT: andl $127, %esi
; SKX-NEXT: testb %dil, %dil
; SKX-NEXT: vpmovm2b %k1, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
; SKX-NEXT: setne (%rsp,%rsi)
; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k1
; SKX-NEXT: kmovq %k1, %rax
; SKX-NEXT: kmovq %k0, %rdx
; SKX-NEXT: movq %rbp, %rsp
; SKX-NEXT: popq %rbp
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  ; One i1 per byte of %a: set when the byte is nonzero.
  %t1 = icmp ugt <128 x i8> %a, zeroinitializer
  ; The scalar bit to insert: set when %b is nonzero.
  %t2 = icmp ugt i8 %b, 0
  ; %index is a runtime value, so the insert position is not a constant.
  %t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
  ; Reinterpret the 128 mask bits as one integer for the return value.
  %t4 = bitcast <128 x i1> %t3 to i128
  ret i128 %t4
}

; Select on a <2 x i1> mask built from two <2 x half> compares.
; A lane of the vector loaded from %arg1 is kept when the matching lane loaded
; from %arg compares ordered-greater-than zero and ordered-less-than
; half 0xH4600 (6.0); every other lane becomes zero. The selected vector is
; stored to %arg2.
; NOTE(review): the name presumably refers to the two scalar compare bits being
; combined into one mask register (kshift + kor in the expected code) - confirm.
define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %arg2) {
; KNL-LABEL: test_concat_v2i1:
; KNL: ## %bb.0:
; KNL-NEXT: movzwl 2(%rdi), %eax
; KNL-NEXT: movzwl (%rdi), %ecx
; KNL-NEXT: vmovd %ecx, %xmm0
; KNL-NEXT: vcvtph2ps %xmm0, %xmm0
; KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; KNL-NEXT: vucomiss %xmm1, %xmm0
; KNL-NEXT: setb %cl
; KNL-NEXT: andl $1, %ecx
; KNL-NEXT: kmovw %ecx, %k0
; KNL-NEXT: vmovd %eax, %xmm2
; KNL-NEXT: vcvtph2ps %xmm2, %xmm2
; KNL-NEXT: vucomiss %xmm1, %xmm2
; KNL-NEXT: setb %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kshiftlw $1, %k1, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL-NEXT: vucomiss %xmm1, %xmm0
; KNL-NEXT: seta %al
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vucomiss %xmm1, %xmm2
; KNL-NEXT: seta %al
; KNL-NEXT: kmovw %eax, %k2
; KNL-NEXT: kshiftlw $1, %k2, %k2
; KNL-NEXT: korw %k2, %k1, %k1
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kshiftrw $1, %k0, %k1
; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: testb $1, %cl
; KNL-NEXT: movl $0, %ecx
; KNL-NEXT: je LBB85_2
; KNL-NEXT: ## %bb.1:
; KNL-NEXT: movzwl 2(%rsi), %ecx
; KNL-NEXT: LBB85_2:
; KNL-NEXT: kmovw %k0, %edi
; KNL-NEXT: testb $1, %dil
; KNL-NEXT: je LBB85_4
; KNL-NEXT: ## %bb.3:
; KNL-NEXT: movzwl (%rsi), %eax
; KNL-NEXT: LBB85_4:
; KNL-NEXT: movw %ax, (%rdx)
; KNL-NEXT: movw %cx, 2(%rdx)
; KNL-NEXT: retq
;
; SKX-LABEL: test_concat_v2i1:
; SKX: ## %bb.0:
; SKX-NEXT: movzwl (%rdi), %eax
; SKX-NEXT: movzwl 2(%rdi), %ecx
; SKX-NEXT: vmovd %ecx, %xmm0
; SKX-NEXT: vcvtph2ps %xmm0, %xmm0
; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SKX-NEXT: vucomiss %xmm1, %xmm0
; SKX-NEXT: setb %cl
; SKX-NEXT: kmovd %ecx, %k0
; SKX-NEXT: kshiftlb $1, %k0, %k0
; SKX-NEXT: vmovd %eax, %xmm2
; SKX-NEXT: vcvtph2ps %xmm2, %xmm2
; SKX-NEXT: vucomiss %xmm1, %xmm2
; SKX-NEXT: setb %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $7, %k1, %k1
; SKX-NEXT: korw %k0, %k1, %k0
; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: vucomiss %xmm1, %xmm0
; SKX-NEXT: seta %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kshiftlb $1, %k1, %k1
; SKX-NEXT: vucomiss %xmm1, %xmm2
; SKX-NEXT: seta %al
; SKX-NEXT: kmovd %eax, %k2
; SKX-NEXT: kshiftlb $7, %k2, %k2
; SKX-NEXT: kshiftrb $7, %k2, %k2
; SKX-NEXT: korw %k1, %k2, %k1
; SKX-NEXT: kandw %k1, %k0, %k0
; SKX-NEXT: kshiftrb $1, %k0, %k1
; SKX-NEXT: kmovd %k1, %ecx
; SKX-NEXT: xorl %eax, %eax
; SKX-NEXT: testb $1, %cl
; SKX-NEXT: movl $0, %ecx
; SKX-NEXT: je LBB85_2
; SKX-NEXT: ## %bb.1:
; SKX-NEXT: movzwl 2(%rsi), %ecx
; SKX-NEXT: LBB85_2:
; SKX-NEXT: kmovd %k0, %edi
; SKX-NEXT: testb $1, %dil
; SKX-NEXT: je LBB85_4
; SKX-NEXT: ## %bb.3:
; SKX-NEXT: movzwl (%rsi), %eax
; SKX-NEXT: LBB85_4:
; SKX-NEXT: movw %ax, (%rdx)
; SKX-NEXT: movw %cx, 2(%rdx)
; SKX-NEXT: retq
  ; The vector that both compares test.
  %tmp = load <2 x half>, <2 x half>* %arg, align 8
  ; Upper bound: lane is ordered and less than 0xH4600 (6.0 as a half).
  %tmp3 = fcmp fast olt <2 x half> %tmp, <half 0xH4600, half 0xH4600>
  ; Lower bound: lane is ordered and greater than zero.
  %tmp4 = fcmp fast ogt <2 x half> %tmp, zeroinitializer
  ; A lane passes only when both bounds hold.
  %tmp5 = and <2 x i1> %tmp3, %tmp4
  ; The values that passing lanes take.
  %tmp6 = load <2 x half>, <2 x half>* %arg1, align 8
  ; Failing lanes are replaced with zero.
  %tmp7 = select <2 x i1> %tmp5, <2 x half> %tmp6, <2 x half> zeroinitializer
  store <2 x half> %tmp7, <2 x half>* %arg2, align 8
  ret void
}