; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-ALL %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-PERLANE %s

; These patterns are produced by LoopVectorizer for interleaved stores.

define void @vf2(<2 x i16>* %in.vecptr0, <2 x i16>* %in.vecptr1, <2 x i16>* %in.vecptr2, <2 x i16>* %in.vecptr3, <2 x i16>* %in.vecptr4, <10 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,u,u,2,3,6,7,10,11,u,u,18,19,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,3,4,7,4,7]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vmovd %xmm1, 16(%r9)
; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r9)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,ymm0[2,3,6,7,10,11],zero,zero,ymm0[18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,ymm0[30,31,30,31,16,17,18,19,28,29,30,31]
; AVX2-FAST-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vmovd %xmm1, 16(%r9)
; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r9)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
  %in.vec0 = load <2 x i16>, <2 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <2 x i16>, <2 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <2 x i16>, <2 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <2 x i16>, <2 x i16>* %in.vecptr3, align 32
  %in.vec4 = load <2 x i16>, <2 x i16>* %in.vecptr4, align 32

  %concat01 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat23 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %concat0123 = shufflevector <4 x i16> %concat01, <4 x i16> %concat23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat4uuu = shufflevector <2 x i16> %in.vec4, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat01234 = shufflevector <8 x i16> %concat0123, <8 x i16> %concat4uuu, <10 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
  %interleaved.vec = shufflevector <10 x i16> %concat01234, <10 x i16> poison, <10 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 1, i32 3, i32 5, i32 7, i32 9>

  store <10 x i16> %interleaved.vec, <10 x i16>* %out.vec, align 32

  ret void
}

define void @vf4(<4 x i16>* %in.vecptr0, <4 x i16>* %in.vecptr1, <4 x i16>* %in.vecptr2, <4 x i16>* %in.vecptr3, <4 x i16>* %in.vecptr4, <20 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf4:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf4:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FAST-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9,u,u,u,u,u,u,2,3,10,11,u,u,26,27,u,u,u,u,u,u,20,21,28,29,u,u,u,u]
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,8,9,u,u,u,u,u,u,2,3,u,u,u,u,20,21,28,29,u,u,u,u,u,u,22,23]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6],ymm2[7],ymm4[8,9],ymm2[10,11],ymm4[12,13,14],ymm2[15]
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX2-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
  %in.vec0 = load <4 x i16>, <4 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <4 x i16>, <4 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <4 x i16>, <4 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <4 x i16>, <4 x i16>* %in.vecptr3, align 32
  %in.vec4 = load <4 x i16>, <4 x i16>* %in.vecptr4, align 32

  %concat01 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat23 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %concat0123 = shufflevector <8 x i16> %concat01, <8 x i16> %concat23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %concat4uuu = shufflevector <4 x i16> %in.vec4, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat01234 = shufflevector <16 x i16> %concat0123, <16 x i16> %concat4uuu, <20 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
  %interleaved.vec = shufflevector <20 x i16> %concat01234, <20 x i16> poison, <20 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 1, i32 5, i32 9, i32 13, i32 17, i32 2, i32 6, i32 10, i32 14, i32 18, i32 3, i32 7, i32 11, i32 15, i32 19>

  store <20 x i16> %interleaved.vec, <20 x i16>* %out.vec, align 32

  ret void
}

define void @vf8(<8 x i16>* %in.vecptr0, <8 x i16>* %in.vecptr1, <8 x i16>* %in.vecptr2, <8 x i16>* %in.vecptr3, <8 x i16>* %in.vecptr4, <40 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm3
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm2
; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[6,7,u,u,u,u,10,11,u,u,8,9,u,u,u,u,22,23,u,u,u,u,26,27,u,u,24,25,u,u,u,u]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,6,7,10,11,u,u,8,9,u,u,8,9,12,13,u,u,22,23,26,27,u,u,24,25,u,u,24,25,28,29]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm6
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,10,11,6,7,u,u,u,u,10,11,12,13,8,9,24,25,26,27,22,23,u,u,u,u,26,27,28,29,24,25]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4],ymm7[5,6,7,8,9,10],ymm6[11,12],ymm7[13,14,15]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %ymm6
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm8
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm8[22,23]
; AVX2-SLOW-NEXT: vpor %ymm6, %ymm8, %ymm6
; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm8
; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,6]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
; AVX2-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-SLOW-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm6, (%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm5, 32(%r9)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: vf8:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm2
; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm3
; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %xmm4
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,12,13],zero,zero,zero,zero,ymm5[2,3,18,19,18,19],zero,zero,zero,zero,ymm5[28,29,20,21,28,29],zero,zero
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm6
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm6[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm6[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm6[22,23]
; AVX2-FAST-ALL-NEXT: vpor %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT: vpbroadcastq (%r8), %ymm6
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,2,u,6,2,u,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm8, %ymm6
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[2,3,6,7,6,7],zero,zero,zero,zero,ymm6[8,9,16,17,18,19],zero,zero,zero,zero,ymm6[22,23,18,19,18,19],zero,zero
; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm8
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,2,6,2,6,3,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm8, %ymm9, %ymm8
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,6,7],zero,zero,zero,zero,zero,zero,ymm8[8,9,12,13],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25]
; AVX2-FAST-ALL-NEXT: vpor %ymm6, %ymm8, %ymm6
; AVX2-FAST-ALL-NEXT: vpbroadcastq 8(%rdi), %ymm8
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6
; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,8,9,14,15,u,u,u,u,u,u,12,13]
; AVX2-FAST-ALL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6],xmm1[7]
; AVX2-FAST-ALL-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm6, 32(%r9)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%r9)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vf8:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm4
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[6,7,u,u,u,u,10,11,u,u,8,9,u,u,u,u,22,23,u,u,u,u,26,27,u,u,24,25,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,6,7,10,11,u,u,8,9,u,u,8,9,12,13,u,u,22,23,26,27,u,u,24,25,u,u,24,25,28,29]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,10,11,6,7,u,u,u,u,10,11,12,13,8,9,24,25,26,27,22,23,u,u,u,u,26,27,28,29,24,25]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,5,5,5,8,9,10,11,13,13,13,13]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4],ymm7[5,6,7,8,9,10],ymm6[11,12],ymm7[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %ymm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,12,13],zero,zero,zero,zero,ymm6[2,3,18,19,18,19],zero,zero,zero,zero,ymm6[28,29,20,21,28,29],zero,zero
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,ymm8[2,3,10,11],zero,zero,zero,zero,zero,zero,ymm8[20,21,28,29],zero,zero,zero,zero,zero,zero,ymm8[22,23]
; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm8, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm8
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,8,9,14,15,u,u,u,u,u,u,12,13]
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
; AVX2-FAST-PERLANE-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, 32(%r9)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
  %in.vec0 = load <8 x i16>, <8 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <8 x i16>, <8 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <8 x i16>, <8 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <8 x i16>, <8 x i16>* %in.vecptr3, align 32
  %in.vec4 = load <8 x i16>, <8 x i16>* %in.vecptr4, align 32

  %concat01 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %concat23 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %concat0123 = shufflevector <16 x i16> %concat01, <16 x i16> %concat23, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %concat4uuu = shufflevector <8 x i16> %in.vec4, <8 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat01234 = shufflevector <32 x i16> %concat0123, <32 x i16> %concat4uuu, <40 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39>
  %interleaved.vec = shufflevector <40 x i16> %concat01234, <40 x i16> poison, <40 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 1, i32 9, i32 17, i32 25, i32 33, i32 2, i32 10, i32 18, i32 26, i32 34, i32 3, i32 11, i32 19, i32 27, i32 35, i32 4, i32 12, i32 20, i32 28, i32 36, i32 5, i32 13, i32 21, i32 29, i32 37, i32 6, i32 14, i32 22, i32 30, i32 38, i32 7, i32 15, i32 23, i32 31, i32 39>

  store <40 x i16> %interleaved.vec, <40 x i16>* %out.vec, align 32

  ret void
}

define void @vf16(<16 x i16>* %in.vecptr0, <16 x i16>* %in.vecptr1, <16 x i16>* %in.vecptr2, <16 x i16>* %in.vecptr3, <16 x i16>* %in.vecptr4, <80 x i16>* %out.vec) nounwind {
; AVX2-SLOW-LABEL: vf16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm12
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm11
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm4
; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm2
; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm5
; AVX2-SLOW-NEXT: vmovdqa (%rcx), %xmm6
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm7, %ymm1
; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm8
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm8, %ymm9
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,2,2,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,u,u,10,11,u,u,8,9,8,9,u,u,12,13]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,3,3,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,4]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2],xmm1[3],xmm5[4],xmm1[5,6],xmm5[7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %ymm1
; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm8
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,0,1,4,5,4,5]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,u,u,24,25,20,21,u,u,24,25]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,2,4,5,6,7,11,9,10,10,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[3,2,3,3,7,6,7,7]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,2,4,6,7,6]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1
; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} ymm5 = ymm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4],ymm4[5],ymm2[6,7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12],ymm4[13],ymm2[14,15]
; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31]
; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2
; AVX2-SLOW-NEXT: vmovdqa %ymm2, 64(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm1, 128(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, 96(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm8, 32(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%r9)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: vf16:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm12
; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %ymm11
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %ymm4
; AVX2-FAST-ALL-NEXT: vmovdqa (%r8), %ymm2
; AVX2-FAST-ALL-NEXT: vmovdqa (%rsi), %xmm5
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdx), %xmm6
; AVX2-FAST-ALL-NEXT: vmovdqa (%rcx), %xmm0
; AVX2-FAST-ALL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm7, %ymm1, %ymm1
; AVX2-FAST-ALL-NEXT: vpbroadcastq (%r8), %ymm8
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm1, %ymm8, %ymm9
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,u,u,10,11,u,u,8,9,8,9,u,u,12,13]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} xmm5 = mem[2,1,2,3]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,6,7,u,u,10,11,u,u,u,u,8,9]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4],xmm5[5,6],xmm0[7]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vpbroadcastq 8(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm8
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,0,1,4,5,4,5]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,u,u,24,25,20,21,u,u,24,25]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25769803781,25769803781,25769803781,25769803781]
; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm1, %ymm1
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29]
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[3,2,3,3,7,6,7,7]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1
; AVX2-FAST-ALL-NEXT: vpsrldq {{.*#+}} ymm5 = ymm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2]
; AVX2-FAST-ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4],ymm4[5],ymm2[6,7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12],ymm4[13],ymm2[14,15]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm2, 64(%r9)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm1, 128(%r9)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, 96(%r9)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm8, 32(%r9)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%r9)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vf16:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm12
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm0
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm8
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm8, %ymm9
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,u,u,10,11,u,u,8,9,8,9,u,u,12,13]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = mem[2,1,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,6,7,u,u,10,11,u,u,u,u,8,9]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3],xmm0[4],xmm5[5,6],xmm0[7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm8
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,0,1,4,5,4,5]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,u,u,22,23,u,u,24,25,20,21,u,u,24,25]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,1,2,1,4,5,6,5]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3],ymm1[4],ymm5[5,6],ymm1[7],ymm5[8,9],ymm1[10],ymm5[11],ymm1[12],ymm5[13,14],ymm1[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,3,2,3,6,7,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[3,2,3,3,7,6,7,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm5, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpsrldq {{.*#+}} ymm5 = ymm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,0,3,0,7,4,7,4]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4],ymm4[5],ymm2[6,7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12],ymm4[13],ymm2[14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[0,1,0,1,0,1,0,1,14,15,2,3,2,3,14,15,16,17,16,17,16,17,16,17,30,31,18,19,18,19,30,31]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 64(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 128(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 96(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, 32(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%r9)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
  %in.vec0 = load <16 x i16>, <16 x i16>* %in.vecptr0, align 32
  %in.vec1 = load <16 x i16>, <16 x i16>* %in.vecptr1, align 32
  %in.vec2 = load <16 x i16>, <16 x i16>* %in.vecptr2, align 32
  %in.vec3 = load <16 x i16>, <16 x i16>* %in.vecptr3, align 32
  %in.vec4 = load <16 x i16>, <16 x i16>* %in.vecptr4, align 32

  %concat01 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %concat23 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %concat0123 = shufflevector <32 x i16> %concat01, <32 x i16> %concat23, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  %concat4uuu = shufflevector <16 x i16> %in.vec4, <16 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %concat01234 = shufflevector <64 x i16> %concat0123, <64 x i16> %concat4uuu, <80 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79>
  %interleaved.vec = shufflevector <80 x i16> %concat01234, <80 x i16> poison, <80 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 1, i32 17, i32 33, i32 49, i32 65, i32 2, i32 18, i32 34, i32 50, i32 66, i32 3, i32 19, i32 35, i32 51, i32 67, i32 4, i32 20, i32 36, i32 52, i32 68, i32 5, i32 21, i32 37, i32 53, i32 69, i32 6, i32 22, i32 38, i32 54, i32 70, i32 7, i32 23, i32 39, i32 55, i32 71, i32 8, i32 24, i32 40, i32 56, i32 72, i32 9, i32 25, i32 41, i32 57, i32 73, i32 10, i32 26, i32 42, i32 58, i32 74, i32 11, i32 27, i32 43, i32 59, i32 75, i32 12, i32 28, i32 44, i32 60, i32 76, i32 13, i32 29, i32 45, i32 61, i32 77, i32 14, i32 30, i32 46, i32 62, i32 78, i32 15, i32 31, i32 47, i32 63, i32 79>

  store <80 x i16> %interleaved.vec, <80 x i16>* %out.vec, align 32

  ret void
}
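
; For reference, a scalar loop of roughly the following shape (a hypothetical C
; sketch; the function and variable names are illustrative and not taken from
; this test) is the kind of source the LoopVectorizer turns into the
; interleaved stores exercised above, using an interleave factor of 5:
; element i of the j'th input vector lands at out[5*i + j].
;
;   void store_i16_stride5(short *out, const short *a, const short *b,
;                          const short *c, const short *d, const short *e,
;                          int n) {
;     for (int i = 0; i < n; ++i) {
;       out[5 * i + 0] = a[i];  // corresponds to %in.vecptr0
;       out[5 * i + 1] = b[i];  // corresponds to %in.vecptr1
;       out[5 * i + 2] = c[i];  // corresponds to %in.vecptr2
;       out[5 * i + 3] = d[i];  // corresponds to %in.vecptr3
;       out[5 * i + 4] = e[i];  // corresponds to %in.vecptr4
;     }
;   }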