; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-ALL %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-PERLANE %s

; These patterns are produced by LoopVectorizer for interleaved stores.

define void @vf2(<2 x i16>* %in.vecptr0, <2 x i16>* %in.vecptr1, <2 x i16>* %in.vecptr2, <2 x i16>* %in.vecptr3, <8 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15]
; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r8)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15]
; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r8)
; AVX2-FAST-NEXT: retq
  %in.vec0 = load <2 x i16>, <2 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <2 x i16>, <2 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <2 x i16>, <2 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <2 x i16>, <2 x i16>* %in.vecptr3, align 32

  %concat01 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat23 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat0123 = shufflevector <4 x i16> %concat01, <4 x i16> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %interleaved.vec = shufflevector <8 x i16> %concat0123, <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>

  store <8 x i16> %interleaved.vec, <8 x i16>* %out.vec, align 32

  ret void
}

define void @vf4(<4 x i16>* %in.vecptr0, <4 x i16>* %in.vecptr1, <4 x i16>* %in.vecptr2, <4 x i16>* %in.vecptr3, <16 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf4:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: vf4:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FAST-ALL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FAST-ALL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,1,3,5,7]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15,16,17,20,21,24,25,28,29,18,19,22,23,26,27,30,31]
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vf4:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u,22,23,30,31]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,20,21,28,29,u,u,u,u,22,23,30,31,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
  %in.vec0 = load <4 x i16>, <4 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <4 x i16>, <4 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <4 x i16>, <4 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <4 x i16>, <4 x i16>* %in.vecptr3, align 32

  %concat01 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat23 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat0123 = shufflevector <8 x i16> %concat01, <8 x i16> %concat23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %interleaved.vec = shufflevector <16 x i16> %concat0123, <16 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>

  store <16 x i16> %interleaved.vec, <16 x i16>* %out.vec, align 32

  ret void
}

define void @vf8(<8 x i16>* %in.vecptr0, <8 x i16>* %in.vecptr1, <8 x i16>* %in.vecptr2, <8 x i16>* %in.vecptr3, <32 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-SLOW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15>
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r8)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15>
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r8)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
  %in.vec0 = load <8 x i16>, <8 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <8 x i16>, <8 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <8 x i16>, <8 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <8 x i16>, <8 x i16>* %in.vecptr3, align 32

  %concat01 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %concat23 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %concat0123 = shufflevector <16 x i16> %concat01, <16 x i16> %concat23, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %interleaved.vec = shufflevector <32 x i16> %concat0123, <32 x i16> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>

  store <32 x i16> %interleaved.vec, <32 x i16>* %out.vec, align 32

  ret void
}

define void @vf16(<16 x i16>* %in.vecptr0, <16 x i16>* %in.vecptr1, <16 x i16>* %in.vecptr2, <16 x i16>* %in.vecptr3, <64 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm5
; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm8
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6
; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm9
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm7
; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm3
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-SLOW-NEXT: vmovdqa %ymm2, 96(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm11, 64(%r8)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm5
; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm8
; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6
; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm9
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7
; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm3
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm4
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm10, %ymm2
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm0
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-FAST-NEXT: vmovdqa %ymm2, 96(%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm11, 64(%r8)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
  %in.vec0 = load <16 x i16>, <16 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <16 x i16>, <16 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <16 x i16>, <16 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <16 x i16>, <16 x i16>* %in.vecptr3, align 32

  %concat01 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %concat23 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %concat0123 = shufflevector <32 x i16> %concat01, <32 x i16> %concat23, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %interleaved.vec = shufflevector <64 x i16> %concat0123, <64 x i16> poison, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>

  store <64 x i16> %interleaved.vec, <64 x i16>* %out.vec, align 32

  ret void
}

define void @vf32(<32 x i16>* %in.vecptr0, <32 x i16>* %in.vecptr1, <32 x i16>* %in.vecptr2, <32 x i16>* %in.vecptr3, <128 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm15
; AVX2-SLOW-NEXT: vmovdqa 16(%rcx), %xmm12
; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11
; AVX2-SLOW-NEXT: vmovdqa 48(%rcx), %xmm2
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm6
; AVX2-SLOW-NEXT: vmovdqa 16(%rdx), %xmm13
; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm1
; AVX2-SLOW-NEXT: vmovdqa 48(%rdx), %xmm7
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm8
; AVX2-SLOW-NEXT: vmovdqa 16(%rsi), %xmm14
; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm3
; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0
; AVX2-SLOW-NEXT: vmovdqa 48(%rsi), %xmm10
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm4
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7]
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
; AVX2-SLOW-NEXT: vmovdqa %ymm3, 96(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, 192(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm11, 224(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm8, 128(%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm9, 160(%r8)
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vmovaps %ymm0, 64(%r8)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm15
; AVX2-FAST-NEXT: vmovdqa 16(%rcx), %xmm12
; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11
; AVX2-FAST-NEXT: vmovdqa 48(%rcx), %xmm2
; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm6
; AVX2-FAST-NEXT: vmovdqa 16(%rdx), %xmm13
; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm1
; AVX2-FAST-NEXT: vmovdqa 48(%rdx), %xmm7
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm8
; AVX2-FAST-NEXT: vmovdqa 16(%rsi), %xmm14
; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm0
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm8
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm10, %ymm0
; AVX2-FAST-NEXT: vmovdqa 48(%rsi), %xmm10
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm7, %ymm3
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7]
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm3
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,0,1,1]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
; AVX2-FAST-NEXT: vmovdqa %ymm3, 96(%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm1, (%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm2, 192(%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm11, 224(%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm8, 128(%r8)
; AVX2-FAST-NEXT: vmovdqa %ymm9, 160(%r8)
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r8)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
  %in.vec0 = load <32 x i16>, <32 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <32 x i16>, <32 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <32 x i16>, <32 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <32 x i16>, <32 x i16>* %in.vecptr3, align 32

  %concat01 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %concat23 = shufflevector <32 x i16> %in.vec2, <32 x i16> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %concat0123 = shufflevector <64 x i16> %concat01, <64 x i16> %concat23, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %interleaved.vec = shufflevector <128 x i16> %concat0123, <128 x i16> poison, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>

  store <128 x i16> %interleaved.vec, <128 x i16>* %out.vec, align 32

  ret void
}
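
; For reference, the @vf<N> functions above model a stride-4 interleaved store
; group with interleave factor 4. A hypothetical scalar source loop of that
; shape is sketched below (illustrative only, not part of the autogenerated
; checks; the function and parameter names are made up for this sketch):
;
;   void store_i16_stride4(short *a, short *b, short *c, short *d,
;                          short *out, int n) {
;     for (int i = 0; i < n; i++) {
;       out[4 * i + 0] = a[i]; // %in.vec0
;       out[4 * i + 1] = b[i]; // %in.vec1
;       out[4 * i + 2] = c[i]; // %in.vec2
;       out[4 * i + 3] = d[i]; // %in.vec3
;     }
;   }
;
; Vectorizing such a loop concatenates one vector from each input stream and
; then applies the <i32 0, i32 N, i32 2N, i32 3N, i32 1, ...> shuffle before a
; single wide store, which is exactly the IR pattern exercised above.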