; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2-SLOW %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-ALL %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2-FAST,AVX2-FAST-PERLANE %s

; These patterns are produced by LoopVectorizer for interleaved loads.

define void @vf2(<10 x i16>* %in.vec, <2 x i16>* %out.vec0, <2 x i16>* %out.vec1, <2 x i16>* %out.vec2, <2 x i16>* %out.vec3, <2 x i16>* %out.vec4) nounwind {
; AVX2-SLOW-LABEL: vf2:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX2-SLOW-NEXT: vmovd %xmm3, (%rdx)
; AVX2-SLOW-NEXT: vmovd %xmm4, (%rcx)
; AVX2-SLOW-NEXT: vmovd %xmm0, (%r8)
; AVX2-SLOW-NEXT: vmovd %xmm1, (%r9)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf2:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX2-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX2-FAST-NEXT: vmovd %xmm3, (%rdx)
; AVX2-FAST-NEXT: vmovd %xmm4, (%rcx)
; AVX2-FAST-NEXT: vmovd %xmm0, (%r8)
; AVX2-FAST-NEXT: vmovd %xmm1, (%r9)
; AVX2-FAST-NEXT: retq
  %wide.vec = load <10 x i16>, <10 x i16>* %in.vec, align 32

  %strided.vec0 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 0, i32 5>
  %strided.vec1 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 1, i32 6>
  %strided.vec2 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 2, i32 7>
  %strided.vec3 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 3, i32 8>
  %strided.vec4 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 4, i32 9>

  store <2 x i16> %strided.vec0, <2 x i16>* %out.vec0, align 32
  store <2 x i16> %strided.vec1, <2 x i16>* %out.vec1, align 32
  store <2 x i16> %strided.vec2, <2 x i16>* %out.vec2, align 32
  store <2 x i16> %strided.vec3, <2 x i16>* %out.vec3, align 32
  store <2 x i16> %strided.vec4, <2 x i16>* %out.vec4, align 32

  ret void
}
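; As noted at the top of the file, these patterns come from the LoopVectorizer
; vectorizing interleaved (stride-5) loads. Purely as a hedged illustration,
; not part of the test, and with made-up names (deinterleave_stride5, src,
; dst0..dst4, n): a scalar source loop of roughly the following C shape is the
; kind of input that becomes one wide load plus the five strided shufflevectors
; exercised by @vf2/@vf4/@vf8/@vf16 in this file.
;
;   void deinterleave_stride5(const short *src, short *dst0, short *dst1,
;                             short *dst2, short *dst3, short *dst4, int n) {
;     for (int i = 0; i < n; ++i) {
;       dst0[i] = src[5 * i + 0]; // lane 0 -> %strided.vec0
;       dst1[i] = src[5 * i + 1]; // lane 1 -> %strided.vec1
;       dst2[i] = src[5 * i + 2]; // lane 2 -> %strided.vec2
;       dst3[i] = src[5 * i + 3]; // lane 3 -> %strided.vec3
;       dst4[i] = src[5 * i + 4]; // lane 4 -> %strided.vec4
;     }
;   }
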
define void @vf4(<20 x i16>* %in.vec, <4 x i16>* %out.vec0, <4 x i16>* %out.vec1, <4 x i16>* %out.vec2, <4 x i16>* %out.vec3, <4 x i16>* %out.vec4) nounwind {
; AVX2-SLOW-LABEL: vf4:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX2-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX2-SLOW-NEXT: vmovq %xmm4, (%rdx)
; AVX2-SLOW-NEXT: vmovq %xmm5, (%rcx)
; AVX2-SLOW-NEXT: vmovq %xmm6, (%r8)
; AVX2-SLOW-NEXT: vmovq %xmm0, (%r9)
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf4:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
; AVX2-FAST-NEXT: vmovq %xmm3, (%rsi)
; AVX2-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX2-FAST-NEXT: vmovq %xmm5, (%rcx)
; AVX2-FAST-NEXT: vmovq %xmm6, (%r8)
; AVX2-FAST-NEXT: vmovq %xmm0, (%r9)
; AVX2-FAST-NEXT: retq
  %wide.vec = load <20 x i16>, <20 x i16>* %in.vec, align 32

  %strided.vec0 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
  %strided.vec1 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
  %strided.vec2 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
  %strided.vec3 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
  %strided.vec4 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>

  store <4 x i16> %strided.vec0, <4 x i16>* %out.vec0, align 32
  store <4 x i16> %strided.vec1, <4 x i16>* %out.vec1, align 32
  store <4 x i16> %strided.vec2, <4 x i16>* %out.vec2, align 32
  store <4 x i16> %strided.vec3, <4 x i16>* %out.vec3, align 32
  store <4 x i16> %strided.vec4, <4 x i16>* %out.vec4, align 32

  ret void
}

define void @vf8(<40 x i16>* %in.vec, <8 x i16>* %out.vec0, <8 x i16>* %out.vec1, <8 x i16>* %out.vec2, <8 x i16>* %out.vec3, <8 x i16>* %out.vec4) nounwind {
; AVX2-SLOW-LABEL: vf8:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
; AVX2-SLOW-NEXT: vpbroadcastw 70(%rdi), %xmm3
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX2-SLOW-NEXT: vpsllq $48, %xmm4, %xmm5
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,0]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3]
; AVX2-SLOW-NEXT: vmovdqa %xmm1, (%rsi)
; AVX2-SLOW-NEXT: vmovdqa %xmm3, (%rdx)
; AVX2-SLOW-NEXT: vmovdqa %xmm5, (%rcx)
; AVX2-SLOW-NEXT: vmovdqa %xmm6, (%r8)
; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%r9)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: vf8:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u]
; AVX2-FAST-NEXT: vpbroadcastw 70(%rdi), %xmm3
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u]
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX2-FAST-NEXT: vpsllq $48, %xmm4, %xmm5
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rsi)
; AVX2-FAST-NEXT: vmovdqa %xmm3, (%rdx)
; AVX2-FAST-NEXT: vmovdqa %xmm5, (%rcx)
; AVX2-FAST-NEXT: vmovdqa %xmm6, (%r8)
; AVX2-FAST-NEXT: vmovdqa %xmm0, (%r9)
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
  %wide.vec = load <40 x i16>, <40 x i16>* %in.vec, align 32

  %strided.vec0 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35>
  %strided.vec1 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36>
  %strided.vec2 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37>
  %strided.vec3 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38>
  %strided.vec4 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39>

  store <8 x i16> %strided.vec0, <8 x i16>* %out.vec0, align 32
  store <8 x i16> %strided.vec1, <8 x i16>* %out.vec1, align 32
  store <8 x i16> %strided.vec2, <8 x i16>* %out.vec2, align 32
  store <8 x i16> %strided.vec3, <8 x i16>* %out.vec3, align 32
  store <8 x i16> %strided.vec4, <8 x i16>* %out.vec4, align 32

  ret void
}

define void @vf16(<80 x i16>* %in.vec, <16 x i16>* %out.vec0, <16 x i16>* %out.vec1, <16 x i16>* %out.vec2, <16 x i16>* %out.vec3, <16 x i16>* %out.vec4) nounwind {
; AVX2-SLOW-LABEL: vf16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2
; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm5
; AVX2-SLOW-NEXT: vmovdqa 144(%rdi), %xmm6
; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm5
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6],xmm5[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5],ymm7[6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm6[2],xmm4[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm4[2],xmm6[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovdqa %ymm9, (%rsi)
; AVX2-SLOW-NEXT: vmovdqa %ymm10, (%rdx)
; AVX2-SLOW-NEXT: vmovdqa %ymm8, (%rcx)
; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r8)
; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r9)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: vf16:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovdqa 128(%rdi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-FAST-ALL-NEXT: vmovdqa 64(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vmovdqa 96(%rdi), %ymm2
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3]
; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm7, %ymm6
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,0,3,5,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27>
; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm6, %ymm6
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm6, %xmm5
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6],xmm5[7]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6>
; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm10, %ymm6
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
; AVX2-FAST-ALL-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,1,3,6,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25>
; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm6, %ymm6
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,2,u,u,5,7,2,4>
; AVX2-FAST-ALL-NEXT: vpermd %ymm7, %ymm10, %ymm7
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,4,6,0,1,4,6,0]
; AVX2-FAST-ALL-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm7, %ymm7
; AVX2-FAST-ALL-NEXT: vpshufb %ymm11, %ymm7, %ymm7
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm10 = <0,3,u,u,5,0,2,7>
; AVX2-FAST-ALL-NEXT: vpermd %ymm6, %ymm10, %ymm6
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7]
; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0]
; AVX2-FAST-ALL-NEXT: # ymm6 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm6, %ymm6
; AVX2-FAST-ALL-NEXT: vpshufb %ymm8, %ymm6, %ymm6
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
; AVX2-FAST-ALL-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5>
; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5],ymm3[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,0,2,5,7>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm9, (%rsi)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm12, (%rdx)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm7, (%rcx)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm5, (%r8)
; AVX2-FAST-ALL-NEXT: vmovdqa %ymm0, (%r9)
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: vf16:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm6
; AVX2-FAST-PERLANE-NEXT: vmovdqa 144(%rdi), %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm5
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6],xmm6[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5],ymm7[6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm4[2],xmm5[3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[0,1,2,3,0,1,10,11,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, (%rsi)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, (%rdx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, (%rcx)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, (%r8)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r9)
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
  %wide.vec = load <80 x i16>, <80 x i16>* %in.vec, align 32

  %strided.vec0 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
  %strided.vec1 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76>
  %strided.vec2 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77>
  %strided.vec3 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78>
  %strided.vec4 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79>

  store <16 x i16> %strided.vec0, <16 x i16>* %out.vec0, align 32
  store <16 x i16> %strided.vec1, <16 x i16>* %out.vec1, align 32
  store <16 x i16> %strided.vec2, <16 x i16>* %out.vec2, align 32
  store <16 x i16> %strided.vec3, <16 x i16>* %out.vec3, align 32
  store <16 x i16> %strided.vec4, <16 x i16>* %out.vec4, align 32

  ret void
}