; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

; Each test below loads a 256-bit vector, extracts a strided subset of its
; elements (stride 2, 4 or 8, starting at a nonzero offset) with a single
; shufflevector, and stores the narrowed result.  The CHECK bodies pin the
; lowering on the AVX / AVX2 / AVX-512 subtarget combinations above; several
; RUN lines intentionally share a prefix so both fast-variable-shuffle
; attribute combinations must produce identical code.
; NOTE(review): do not hand-edit the CHECK lines - regenerate them with
; utils/update_llc_test_checks.py.

; Take every odd byte (stride 2, offset 1) of a v32i8, producing a v16i8.
define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

; Take every odd word (stride 2, offset 1) of a v16i16, producing a v8i16.
define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,3,5,7,33,35,37,39]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,3,5,7,9,11,13,15]
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, <8 x i16>* %S
  ret void
}

; Take every odd dword (stride 2, offset 1) of a v8i32, producing a v4i32.
define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, <8 x i32>* %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, <4 x i32>* %S
  ret void
}

; Every 4th byte of a v32i8 starting at offset 1, producing a v8i8.
define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

; Every 4th byte of a v32i8 starting at offset 2, producing a v8i8.
define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

; Every 4th byte of a v32i8 starting at offset 3, producing a v8i8.
define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, <8 x i8>* %S
  ret void
}

; Every 4th word of a v16i16 starting at offset 1, producing a v4i16.
define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,5,33,37,4,5,36,37]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

; Every 4th word of a v16i16 starting at offset 2, producing a v4i16.
define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,6,34,38,2,3,34,35]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

; Every 4th word of a v16i16 starting at offset 3, producing a v4i16.
define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [3,7,35,39,2,3,34,35]
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm1
; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = <3,7,11,15,u,u,u,u>
; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, <16 x i16>* %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, <4 x i16>* %S
  ret void
}

; Every 8th byte of a v32i8 starting at offset 1, producing a v4i8.
define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Every 8th byte of a v32i8 starting at offset 2, producing a v4i8.
define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Every 8th byte of a v32i8 starting at offset 3, producing a v4i8.
define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Every 8th byte of a v32i8 starting at offset 4, producing a v4i8.
define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Every 8th byte of a v32i8 starting at offset 5, producing a v4i8.
define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Every 8th byte of a v32i8 starting at offset 6, producing a v4i8.
define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}

; Every 8th byte of a v32i8 starting at offset 7, producing a v4i8.
define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, <4 x i8>* %S
  ret void
}