; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s

declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)

; Due to a bug in X86RegisterInfo::getLargestLegalSuperClass this test case was trying to use XMM16 and spill it without VLX support for the necessary store instruction. We briefly implemented the spill using VEXTRACTF32X4, but the bug in getLargestLegalSuperClass has now been fixed so we no longer use XMM16.

define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
; CHECK-LABEL: bar:
; CHECK:       # %bb.0:
; CHECK-NEXT:    subq $72, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 80
; CHECK-NEXT:    vmovaps %xmm1, %xmm9
; CHECK-NEXT:    vmovaps {{.*#+}} xmm14 = [4,22,1,17]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm14
; CHECK-NEXT:    vmovaps {{.*#+}} xmm10 = [4,30,1,22]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm10
; CHECK-NEXT:    vmovaps {{.*#+}} xmm8 = [4,28,1,29]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm8
; CHECK-NEXT:    vmovaps {{.*#+}} xmm7 = <5,20,u,u>
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm7
; CHECK-NEXT:    vmovaps {{.*#+}} xmm4 = [4,21,1,7]
; CHECK-NEXT:    vpermi2ps %zmm3, %zmm2, %zmm4
; CHECK-NEXT:    vextractf128 $1, %ymm3, %xmm5
; CHECK-NEXT:    vextractf128 $1, %ymm2, %xmm6
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm13 = xmm1[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT:    vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
; CHECK-NEXT:    vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3]
; CHECK-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; CHECK-NEXT:    vshufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
; CHECK-NEXT:    vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm8
; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; CHECK-NEXT:    vaddps %xmm2, %xmm14, %xmm2
; CHECK-NEXT:    vmovaps %xmm13, %xmm1
; CHECK-NEXT:    vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    vaddps %xmm10, %xmm13, %xmm10
; CHECK-NEXT:    vaddps %xmm13, %xmm13, %xmm3
; CHECK-NEXT:    vaddps %xmm12, %xmm14, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm8, %xmm0
; CHECK-NEXT:    vaddps %xmm0, %xmm13, %xmm0
; CHECK-NEXT:    vmovaps %xmm3, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vmovaps %xmm10, (%rsp)
; CHECK-NEXT:    vmovaps %xmm9, %xmm3
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo@PLT
; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT:    vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    addq $72, %rsp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %a2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
  %a5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 27>
  %a6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 3, i32 20, i32 1, i32 17>
  %a7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 21, i32 1, i32 17>
  %a8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 19>
  %a9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %a10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ax2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
  %ax5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ax6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 18>
  %ax7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 1, i32 20, i32 1, i32 17>
  %ax8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 19>
  %ax9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ax10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ay2 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ay5 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 28, i32 1, i32 17>
  %ay6 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 5, i32 20, i32 1, i32 17>
  %ay7 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 30, i32 1, i32 22>
  %ay8 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 1, i32 17>
  %ay9 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 22, i32 1, i32 17>
  %ay10 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> <i32 4, i32 20, i32 3, i32 18>

  %r1 = fadd <4 x float> %ay10, %ay9
  %r2 = fadd <4 x float> %ay8, %ay7
  %r3 = fadd <4 x float> %ay6, %ay5
  %r4 = fadd <4 x float> %ay2, %ax10
  %r5 = fadd <4 x float> %ay9, %ax8
  %r6 = fadd <4 x float> %r5, %r3
  %r7 = fadd <4 x float> %a9, %r6
  %a11 = call <4 x float> @foo(<4 x float> %r7, <4 x float> %a10, <4 x float> %r1, <4 x float> %a4, <4 x float> %a5, <4 x float> %a6, <4 x float> %a7, <4 x float> %a8, <4 x float> %r2, <4 x float> %r4)
  %a12 = fadd <4 x float> %a2, %a1
  %a13 = fadd <4 x float> %a12, %a11

  ret <4 x float> %a13
}