1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=ALL,SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX1 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLBW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLBW 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLVBMI 12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL,AVX512VLVBMI 13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX1 14; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX2 15 16define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) { 17; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: 18; SSE2: # %bb.0: 19; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 20; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 21; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 22; SSE2-NEXT: retq 23; 24; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: 25; SSSE3: # %bb.0: 26; SSSE3-NEXT: pxor %xmm1, %xmm1 27; SSSE3-NEXT: pshufb %xmm1, %xmm0 28; SSSE3-NEXT: retq 29; 30; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: 31; SSE41: # %bb.0: 32; SSE41-NEXT: pxor %xmm1, %xmm1 33; SSE41-NEXT: pshufb %xmm1, %xmm0 34; SSE41-NEXT: retq 35; 36; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: 37; AVX1: # %bb.0: 38; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 39; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 40; AVX1-NEXT: retq 41; 42; AVX2OR512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: 43; AVX2OR512VL: # %bb.0: 44; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %xmm0 45; AVX2OR512VL-NEXT: retq 46; 47; XOPAVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: 48; XOPAVX1: # %bb.0: 49; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 50; XOPAVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 51; XOPAVX1-NEXT: retq 52; 53; XOPAVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: 54; XOPAVX2: # %bb.0: 55; XOPAVX2-NEXT: vpbroadcastb %xmm0, %xmm0 56; XOPAVX2-NEXT: retq 57 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 58 ret <16 x i8> %shuffle 59} 60 61define <16 x i8> 
@shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) { 62; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: 63; SSE2: # %bb.0: 64; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 65; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] 66; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 67; SSE2-NEXT: retq 68; 69; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: 70; SSSE3: # %bb.0: 71; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] 72; SSSE3-NEXT: retq 73; 74; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: 75; SSE41: # %bb.0: 76; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] 77; SSE41-NEXT: retq 78; 79; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: 80; AVX: # %bb.0: 81; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] 82; AVX-NEXT: retq 83 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 84 ret <16 x i8> %shuffle 85} 86 87define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) { 88; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: 89; SSE2: # %bb.0: 90; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 91; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 92; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 93; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] 94; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 95; SSE2-NEXT: retq 96; 97; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: 98; SSSE3: # %bb.0: 99; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] 100; SSSE3-NEXT: retq 101; 102; SSE41-LABEL: 
shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: 103; SSE41: # %bb.0: 104; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] 105; SSE41-NEXT: retq 106; 107; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: 108; AVX: # %bb.0: 109; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] 110; AVX-NEXT: retq 111 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> 112 ret <16 x i8> %shuffle 113} 114 115define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) { 116; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: 117; SSE: # %bb.0: 118; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 119; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 120; SSE-NEXT: retq 121; 122; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: 123; AVX1: # %bb.0: 124; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 125; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 126; AVX1-NEXT: retq 127; 128; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: 129; AVX2-SLOW: # %bb.0: 130; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 131; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 132; AVX2-SLOW-NEXT: retq 133; 134; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: 135; AVX2-FAST: # %bb.0: 136; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 137; AVX2-FAST-NEXT: retq 138; 139; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: 140; AVX512VL: # %bb.0: 141; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] 142; AVX512VL-NEXT: retq 143; 144; 
XOP-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: 145; XOP: # %bb.0: 146; XOP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 147; XOP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 148; XOP-NEXT: retq 149 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3> 150 ret <16 x i8> %shuffle 151} 152 153define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) { 154; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: 155; SSE: # %bb.0: 156; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 157; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 158; SSE-NEXT: retq 159; 160; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: 161; AVX1: # %bb.0: 162; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 163; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 164; AVX1-NEXT: retq 165; 166; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: 167; AVX2-SLOW: # %bb.0: 168; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 169; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 170; AVX2-SLOW-NEXT: retq 171; 172; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: 173; AVX2-FAST: # %bb.0: 174; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 175; AVX2-FAST-NEXT: retq 176; 177; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: 178; AVX512VL: # %bb.0: 179; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] 180; AVX512VL-NEXT: retq 181; 182; XOP-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: 183; XOP: # %bb.0: 184; XOP-NEXT: vpunpcklbw {{.*#+}} 
xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 185; XOP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 186; XOP-NEXT: retq 187 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7> 188 ret <16 x i8> %shuffle 189} 190 191define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) { 192; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: 193; SSE2: # %bb.0: 194; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 195; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 196; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 197; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 198; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] 199; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] 200; SSE2-NEXT: retq 201; 202; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: 203; SSSE3: # %bb.0: 204; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] 205; SSSE3-NEXT: retq 206; 207; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: 208; SSE41: # %bb.0: 209; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] 210; SSE41-NEXT: retq 211; 212; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: 213; AVX: # %bb.0: 214; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] 215; AVX-NEXT: retq 216 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> 217 ret <16 x i8> %shuffle 218} 219 220define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) { 221; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07: 
222; SSE: # %bb.0: 223; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 224; SSE-NEXT: retq 225; 226; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07: 227; AVX: # %bb.0: 228; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 229; AVX-NEXT: retq 230 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> 231 ret <16 x i8> %shuffle 232} 233 234define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { 235; SSE-LABEL: shuffle_v16i8_0101010101010101: 236; SSE: # %bb.0: 237; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 238; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 239; SSE-NEXT: retq 240; 241; AVX1-LABEL: shuffle_v16i8_0101010101010101: 242; AVX1: # %bb.0: 243; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 244; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 245; AVX1-NEXT: retq 246; 247; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101: 248; AVX2OR512VL: # %bb.0: 249; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0 250; AVX2OR512VL-NEXT: retq 251; 252; XOPAVX1-LABEL: shuffle_v16i8_0101010101010101: 253; XOPAVX1: # %bb.0: 254; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 255; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 256; XOPAVX1-NEXT: retq 257; 258; XOPAVX2-LABEL: shuffle_v16i8_0101010101010101: 259; XOPAVX2: # %bb.0: 260; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0 261; XOPAVX2-NEXT: retq 262 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> 263 ret <16 x i8> %shuffle 264} 265 266define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) { 267; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: 268; SSE: # 
%bb.0: 269; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 270; SSE-NEXT: retq 271; 272; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: 273; AVX: # %bb.0: 274; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 275; AVX-NEXT: retq 276 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> 277 ret <16 x i8> %shuffle 278} 279 280define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) { 281; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31: 282; SSE: # %bb.0: 283; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 284; SSE-NEXT: retq 285; 286; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31: 287; AVX: # %bb.0: 288; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 289; AVX-NEXT: retq 290 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 291 ret <16 x i8> %shuffle 292} 293 294define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) { 295; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: 296; SSE: # %bb.0: 297; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 298; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[0,0,0,0,4,5,6,7] 299; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 300; SSE-NEXT: movdqa %xmm1, %xmm0 301; SSE-NEXT: retq 302; 303; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: 304; AVX1: # %bb.0: 305; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 306; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 307; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 308; AVX1-NEXT: retq 309; 310; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: 311; AVX2OR512VL: # %bb.0: 312; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1 313; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 314; AVX2OR512VL-NEXT: retq 315; 316; XOPAVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: 317; XOPAVX1: # %bb.0: 318; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[0],xmm0[1],xmm1[0],xmm0[2],xmm1[0],xmm0[3],xmm1[0],xmm0[4],xmm1[0],xmm0[5],xmm1[0],xmm0[6],xmm1[0],xmm0[7] 319; XOPAVX1-NEXT: retq 320; 321; XOPAVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: 322; XOPAVX2: # %bb.0: 323; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 324; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 325; XOPAVX2-NEXT: retq 326 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7> 327 ret <16 x i8> %shuffle 328} 329 330define <16 x i8> 
@shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<16 x i8> %a, <16 x i8> %b) { 331; SSE2-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: 332; SSE2: # %bb.0: 333; SSE2-NEXT: pxor %xmm1, %xmm1 334; SSE2-NEXT: movdqa %xmm0, %xmm2 335; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 336; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 337; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 338; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 339; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 340; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 341; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 342; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 343; SSE2-NEXT: packuswb %xmm2, %xmm0 344; SSE2-NEXT: retq 345; 346; SSSE3-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: 347; SSSE3: # %bb.0: 348; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 349; SSSE3-NEXT: retq 350; 351; SSE41-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: 352; SSE41: # %bb.0: 353; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 354; SSE41-NEXT: retq 355; 356; AVX-LABEL: shuffle_v16i8_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: 357; AVX: # %bb.0: 358; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 359; AVX-NEXT: retq 360 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> 361 ret <16 x i8> %shuffle 362} 363 364define <16 x i8> @shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08(<16 x i8> %a, <16 x i8> %b) { 
365; SSE2-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08: 366; SSE2: # %bb.0: 367; SSE2-NEXT: pxor %xmm1, %xmm1 368; SSE2-NEXT: movdqa %xmm0, %xmm2 369; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 370; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 371; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 372; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 373; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 374; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 375; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 376; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 377; SSE2-NEXT: packuswb %xmm2, %xmm0 378; SSE2-NEXT: retq 379; 380; SSSE3-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08: 381; SSSE3: # %bb.0: 382; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 383; SSSE3-NEXT: retq 384; 385; SSE41-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08: 386; SSE41: # %bb.0: 387; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 388; SSE41-NEXT: retq 389; 390; AVX-LABEL: shuffle_v16i8_07_06_05_04_03_02_01_00_15_14_13_12_11_10_09_08: 391; AVX: # %bb.0: 392; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] 393; AVX-NEXT: retq 394 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8> 395 ret <16 x i8> %shuffle 396} 397 398define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) { 399; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: 400; SSE2: # 
%bb.0: 401; SSE2-NEXT: pxor %xmm1, %xmm1 402; SSE2-NEXT: movdqa %xmm0, %xmm2 403; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 404; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] 405; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] 406; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 407; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 408; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 409; SSE2-NEXT: packuswb %xmm2, %xmm0 410; SSE2-NEXT: retq 411; 412; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: 413; SSSE3: # %bb.0: 414; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 415; SSSE3-NEXT: retq 416; 417; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: 418; SSE41: # %bb.0: 419; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 420; SSE41-NEXT: retq 421; 422; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: 423; AVX: # %bb.0: 424; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 425; AVX-NEXT: retq 426 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> 427 ret <16 x i8> %shuffle 428} 429 430define <16 x i8> @shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14(<16 x i8> %a, <16 x i8> %b) { 431; SSE2-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14: 432; SSE2: # %bb.0: 433; SSE2-NEXT: movdqa %xmm0, %xmm1 434; SSE2-NEXT: psrlw $8, %xmm1 435; SSE2-NEXT: psllw $8, %xmm0 436; SSE2-NEXT: por %xmm1, %xmm0 437; SSE2-NEXT: retq 438; 439; SSSE3-LABEL: 
shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14: 440; SSSE3: # %bb.0: 441; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 442; SSSE3-NEXT: retq 443; 444; SSE41-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14: 445; SSE41: # %bb.0: 446; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 447; SSE41-NEXT: retq 448; 449; AVX1-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14: 450; AVX1: # %bb.0: 451; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 452; AVX1-NEXT: retq 453; 454; AVX2OR512VL-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14: 455; AVX2OR512VL: # %bb.0: 456; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] 457; AVX2OR512VL-NEXT: retq 458; 459; XOP-LABEL: shuffle_v16i8_01_00_03_02_05_04_07_06_09_08_11_10_13_12_15_14: 460; XOP: # %bb.0: 461; XOP-NEXT: vprotw $8, %xmm0, %xmm0 462; XOP-NEXT: retq 463 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> 464 ret <16 x i8> %shuffle 465} 466 467define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { 468; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 469; SSE2: # %bb.0: 470; SSE2-NEXT: pxor %xmm2, %xmm2 471; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 472; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] 473; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] 474; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 475; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[3,2,1,0,4,5,6,7] 476; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 477; SSE2-NEXT: packuswb %xmm1, %xmm0 478; SSE2-NEXT: retq 479; 480; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 481; SSSE3: # %bb.0: 482; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 483; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] 484; SSSE3-NEXT: retq 485; 486; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 487; SSE41: # %bb.0: 488; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 489; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] 490; SSE41-NEXT: retq 491; 492; AVX1-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 493; AVX1: # %bb.0: 494; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 495; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] 496; AVX1-NEXT: retq 497; 498; AVX2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 499; AVX2: # %bb.0: 500; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 501; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] 502; AVX2-NEXT: retq 503; 504; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 505; AVX512VLBW: # %bb.0: 506; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 507; AVX512VLBW-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9] 508; AVX512VLBW-NEXT: retq 509; 510; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 511; AVX512VLVBMI: # %bb.0: 512; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,19,18,17,16,23,22,21,20] 513; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 514; AVX512VLVBMI-NEXT: retq 515; 516; XOP-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: 517; XOP: # %bb.0: 518; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],xmm1[3,2,1,0,7,6,5,4] 519; XOP-NEXT: retq 520 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20> 521 ret <16 x i8> %shuffle 522} 523 524define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { 525; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 526; SSE2: # %bb.0: 527; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 528; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 529; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 530; SSE2-NEXT: pxor %xmm1, %xmm1 531; SSE2-NEXT: movdqa %xmm0, %xmm2 532; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 533; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[3,2,1,0,4,5,6,7] 534; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 535; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4] 536; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1] 537; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[3,2,1,0,4,5,6,7] 538; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,7,6,5,4] 539; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] 540; SSE2-NEXT: 
packuswb %xmm1, %xmm0 541; SSE2-NEXT: retq 542; 543; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 544; SSSE3: # %bb.0: 545; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] 546; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] 547; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 548; SSSE3-NEXT: retq 549; 550; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 551; SSE41: # %bb.0: 552; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] 553; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] 554; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 555; SSE41-NEXT: retq 556; 557; AVX1-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 558; AVX1: # %bb.0: 559; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] 560; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] 561; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 562; AVX1-NEXT: retq 563; 564; AVX2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 565; AVX2: # %bb.0: 566; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] 567; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] 568; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 569; AVX2-NEXT: retq 570; 571; AVX512VLBW-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 572; AVX512VLBW: # %bb.0: 573; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u] 574; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u] 575; AVX512VLBW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 576; AVX512VLBW-NEXT: retq 577; 578; AVX512VLVBMI-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 579; 
AVX512VLVBMI: # %bb.0: 580; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,31,30,29,28,11,10,9,8,23,22,21,20] 581; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 582; AVX512VLVBMI-NEXT: retq 583; 584; XOP-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: 585; XOP: # %bb.0: 586; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[3,2,1,0],xmm1[15,14,13,12],xmm0[11,10,9,8],xmm1[7,6,5,4] 587; XOP-NEXT: retq 588 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20> 589 ret <16 x i8> %shuffle 590} 591 592define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) { 593; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: 594; SSE2: # %bb.0: 595; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 596; SSE2-NEXT: andps %xmm2, %xmm0 597; SSE2-NEXT: andnps %xmm1, %xmm2 598; SSE2-NEXT: orps %xmm2, %xmm0 599; SSE2-NEXT: retq 600; 601; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: 602; SSSE3: # %bb.0: 603; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] 604; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 605; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 606; SSSE3-NEXT: retq 607; 608; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: 609; SSE41: # %bb.0: 610; SSE41-NEXT: movdqa %xmm0, %xmm2 611; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 612; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 613; SSE41-NEXT: movdqa %xmm1, %xmm0 614; SSE41-NEXT: retq 615; 616; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: 617; AVX1OR2: # %bb.0: 618; 
AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 619; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 620; AVX1OR2-NEXT: retq 621; 622; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: 623; AVX512VL: # %bb.0: 624; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA 625; AVX512VL-NEXT: kmovd %eax, %k1 626; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} 627; AVX512VL-NEXT: retq 628 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 629 ret <16 x i8> %shuffle 630} 631 632define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) { 633; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: 634; SSE2: # %bb.0: 635; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 636; SSE2-NEXT: andps %xmm2, %xmm0 637; SSE2-NEXT: andnps %xmm1, %xmm2 638; SSE2-NEXT: orps %xmm2, %xmm0 639; SSE2-NEXT: retq 640; 641; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: 642; SSSE3: # %bb.0: 643; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15] 644; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero 645; SSSE3-NEXT: por %xmm1, %xmm0 646; SSSE3-NEXT: retq 647; 648; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: 649; SSE41: # %bb.0: 650; SSE41-NEXT: movdqa %xmm0, %xmm2 651; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 652; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 653; SSE41-NEXT: movdqa %xmm1, %xmm0 654; SSE41-NEXT: retq 655; 656; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: 657; AVX1OR2: # %bb.0: 658; AVX1OR2-NEXT: 
vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 659; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 660; AVX1OR2-NEXT: retq 661; 662; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: 663; AVX512VL: # %bb.0: 664; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888 665; AVX512VL-NEXT: kmovd %eax, %k1 666; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} 667; AVX512VL-NEXT: retq 668 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31> 669 ret <16 x i8> %shuffle 670} 671 672define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) { 673; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: 674; SSE: # %bb.0: 675; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 676; SSE-NEXT: retq 677; 678; AVX-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz: 679; AVX: # %bb.0: 680; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 681; AVX-NEXT: retq 682 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31> 683 ret <16 x i8> %shuffle 684} 685 686define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) { 687; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: 688; SSE2: # %bb.0: 689; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] 690; SSE2-NEXT: andps %xmm2, %xmm0 691; SSE2-NEXT: andnps %xmm1, %xmm2 692; SSE2-NEXT: orps %xmm2, %xmm0 693; SSE2-NEXT: retq 694; 695; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: 696; SSSE3: # %bb.0: 697; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = 
zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15] 698; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero 699; SSSE3-NEXT: por %xmm1, %xmm0 700; SSSE3-NEXT: retq 701; 702; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: 703; SSE41: # %bb.0: 704; SSE41-NEXT: movdqa %xmm0, %xmm2 705; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] 706; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 707; SSE41-NEXT: movdqa %xmm1, %xmm0 708; SSE41-NEXT: retq 709; 710; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: 711; AVX1OR2: # %bb.0: 712; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] 713; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 714; AVX1OR2-NEXT: retq 715; 716; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: 717; AVX512VL: # %bb.0: 718; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090 719; AVX512VL-NEXT: kmovd %eax, %k1 720; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} 721; AVX512VL-NEXT: retq 722 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31> 723 ret <16 x i8> %shuffle 724} 725 726define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) { 727; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: 728; SSE2: # %bb.0: 729; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0] 730; SSE2-NEXT: andps %xmm2, %xmm1 731; SSE2-NEXT: andnps %xmm0, %xmm2 732; SSE2-NEXT: orps %xmm1, %xmm2 733; SSE2-NEXT: movaps %xmm2, %xmm0 734; SSE2-NEXT: retq 735; 736; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: 737; SSSE3: # %bb.0: 738; SSSE3-NEXT: pshufb 
{{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15] 739; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero 740; SSSE3-NEXT: por %xmm1, %xmm0 741; SSSE3-NEXT: retq 742; 743; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: 744; SSE41: # %bb.0: 745; SSE41-NEXT: movdqa %xmm0, %xmm2 746; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0] 747; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2 748; SSE41-NEXT: movdqa %xmm2, %xmm0 749; SSE41-NEXT: retq 750; 751; AVX1OR2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: 752; AVX1OR2: # %bb.0: 753; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0] 754; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 755; AVX1OR2-NEXT: retq 756; 757; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15: 758; AVX512VL: # %bb.0: 759; AVX512VL-NEXT: movw $-21264, %ax # imm = 0xACF0 760; AVX512VL-NEXT: kmovd %eax, %k1 761; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1} 762; AVX512VL-NEXT: retq 763 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15> 764 ret <16 x i8> %shuffle 765} 766 767define <16 x i8> @shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a, <16 x i8> %b) { 768; SSE2-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: 769; SSE2: # %bb.0: 770; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 771; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 772; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 773; SSE2-NEXT: psrlq $16, %xmm0 774; SSE2-NEXT: packuswb %xmm0, %xmm0 775; SSE2-NEXT: retq 776; 777; SSSE3-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: 778; 
SSSE3: # %bb.0: 779; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 780; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 781; SSSE3-NEXT: retq 782; 783; SSE41-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: 784; SSE41: # %bb.0: 785; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 786; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 787; SSE41-NEXT: retq 788; 789; AVX1-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: 790; AVX1: # %bb.0: 791; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 792; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 793; AVX1-NEXT: retq 794; 795; AVX2-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: 796; AVX2: # %bb.0: 797; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 798; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 799; AVX2-NEXT: retq 800; 801; AVX512VLBW-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: 802; AVX512VLBW: # %bb.0: 803; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 804; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 805; AVX512VLBW-NEXT: retq 806; 807; AVX512VLVBMI-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: 808; AVX512VLVBMI: # %bb.0: 809; AVX512VLVBMI-NEXT: vpbroadcastw {{.*#+}} xmm2 = 
[5122,5122,5122,5122,5122,5122,5122,5122] 810; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 811; AVX512VLVBMI-NEXT: retq 812; 813; XOP-LABEL: shuffle_v16i8_02_20_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: 814; XOP: # %bb.0: 815; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[2],xmm1[4],xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u] 816; XOP-NEXT: retq 817 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 20, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 818 ret <16 x i8> %shuffle 819} 820 821; PR39387 - the function name spells lanes 10 and 11 as 31 and 0, but the shufflevector mask at the end of this function (and the vpermt2b index constant and vpperm selector in the checks) select 30 and 1 there, i.e. the mask is 5,6,7,8,9,10,27,28,29,30,30,1,1,2,3,4. NOTE(review) - presumably the repeated indices are what the bug reproducer needs; confirm before renaming the function or changing the mask. 822define <16 x i8> @shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4(<16 x i8> %a, <16 x i8> %b) { 823; SSE2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: 824; SSE2: # %bb.0: 825; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,255] 826; SSE2-NEXT: movdqa %xmm0, %xmm3 827; SSE2-NEXT: pand %xmm2, %xmm3 828; SSE2-NEXT: pandn %xmm1, %xmm2 829; SSE2-NEXT: por %xmm3, %xmm2 830; SSE2-NEXT: pxor %xmm1, %xmm1 831; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 832; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 833; SSE2-NEXT: movdqa %xmm0, %xmm1 834; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,65535] 835; SSE2-NEXT: pand %xmm3, %xmm0 836; SSE2-NEXT: pandn %xmm2, %xmm3 837; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9] 838; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 839; SSE2-NEXT: por %xmm2, %xmm1 840; SSE2-NEXT: por %xmm0, %xmm3 841; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,0] 842; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4,7,5,7] 843; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1] 844; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,3,4,5,6,7] 845; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] 846; SSE2-NEXT: packuswb %xmm0, %xmm1 847; SSE2-NEXT: movdqa %xmm1, %xmm0 848; SSE2-NEXT: retq 849; 850; SSSE3-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: 851; SSSE3: # %bb.0: 852; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 853; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9] 854; SSSE3-NEXT: retq 855; 856; SSE41-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: 857; SSE41: # %bb.0: 858; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 859; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9] 860; SSE41-NEXT: retq 861; 862; AVX1-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: 863; AVX1: # %bb.0: 864; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 865; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9] 866; AVX1-NEXT: retq 867; 868; AVX2-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: 869; AVX2: # %bb.0: 870; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 871; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9] 872; AVX2-NEXT: retq 873; 874; AVX512VLBW-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: 875; AVX512VLBW: # %bb.0: 876; AVX512VLBW-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] 877; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,3,6,6,7,8,9] 878; AVX512VLBW-NEXT: retq 879; 880; AVX512VLVBMI-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: 881; AVX512VLVBMI: # %bb.0: 882; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [5,6,7,8,9,10,27,28,29,30,30,1,1,2,3,4] 883; 
AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 884; AVX512VLVBMI-NEXT: retq 885; 886; XOP-LABEL: shuffle_v16i8_5_6_7_8_9_10_27_28_29_30_31_0_1_2_3_4: 887; XOP: # %bb.0: 888; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10],xmm1[11,12,13,14,14],xmm0[1,1,2,3,4] 889; XOP-NEXT: retq 890 %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 27, i32 28, i32 29, i32 30, i32 30, i32 1, i32 1, i32 2, i32 3, i32 4> 891 ret <16 x i8> %1 892} 893 894; PR27780 - https://bugs.llvm.org/show_bug.cgi?id=27780 895; Byte blend in which one input is loaded from %px. In the checks below the SSE2, SSE4.1 and AVX1/AVX2 sequences take (%rdi) directly as a memory operand of the and-not / blend instruction, while the SSSE3 pshufb sequence loads it with a separate movdqa first. 896define <16 x i8> @load_fold_pblendvb(<16 x i8>* %px, <16 x i8> %y) { 897; SSE2-LABEL: load_fold_pblendvb: 898; SSE2: # %bb.0: 899; SSE2-NEXT: movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] 900; SSE2-NEXT: andps %xmm1, %xmm0 901; SSE2-NEXT: andnps (%rdi), %xmm1 902; SSE2-NEXT: orps %xmm1, %xmm0 903; SSE2-NEXT: retq 904; 905; SSSE3-LABEL: load_fold_pblendvb: 906; SSSE3: # %bb.0: 907; SSSE3-NEXT: movdqa (%rdi), %xmm1 908; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3],zero,zero,zero,xmm0[7,8,9],zero,xmm0[11],zero,zero,zero,xmm0[15] 909; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[2],zero,xmm1[4,5,6],zero,zero,zero,xmm1[10],zero,xmm1[12,13,14],zero 910; SSSE3-NEXT: por %xmm1, %xmm0 911; SSSE3-NEXT: retq 912; 913; SSE41-LABEL: load_fold_pblendvb: 914; SSE41: # %bb.0: 915; SSE41-NEXT: movdqa %xmm0, %xmm1 916; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] 917; SSE41-NEXT: pblendvb %xmm0, (%rdi), %xmm1 918; SSE41-NEXT: movdqa %xmm1, %xmm0 919; SSE41-NEXT: retq 920; 921; AVX1OR2-LABEL: load_fold_pblendvb: 922; AVX1OR2: # %bb.0: 923; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] 924; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 925; AVX1OR2-NEXT: retq 926; 927; AVX512VL-LABEL: load_fold_pblendvb: 928; AVX512VL: # %bb.0: 929; AVX512VL-NEXT: movw $29812, %ax # imm = 0x7474 930; AVX512VL-NEXT: kmovd 
%eax, %k1 931; AVX512VL-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} 932; AVX512VL-NEXT: retq 933 %x = load <16 x i8>, <16 x i8>* %px, align 16 934 %select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31> 935 ret <16 x i8> %select 936} 937 938define <16 x i8> @load_fold_pblendvb_commute(<16 x i8>* %px, <16 x i8> %y) { 939; SSE2-LABEL: load_fold_pblendvb_commute: 940; SSE2: # %bb.0: 941; SSE2-NEXT: movaps {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] 942; SSE2-NEXT: movaps %xmm1, %xmm2 943; SSE2-NEXT: andnps %xmm0, %xmm2 944; SSE2-NEXT: andps (%rdi), %xmm1 945; SSE2-NEXT: orps %xmm2, %xmm1 946; SSE2-NEXT: movaps %xmm1, %xmm0 947; SSE2-NEXT: retq 948; 949; SSSE3-LABEL: load_fold_pblendvb_commute: 950; SSSE3: # %bb.0: 951; SSSE3-NEXT: movdqa (%rdi), %xmm1 952; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[2],zero,xmm0[4,5,6],zero,zero,zero,xmm0[10],zero,xmm0[12,13,14],zero 953; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,xmm1[3],zero,zero,zero,xmm1[7,8,9],zero,xmm1[11],zero,zero,zero,xmm1[15] 954; SSSE3-NEXT: por %xmm1, %xmm0 955; SSSE3-NEXT: retq 956; 957; SSE41-LABEL: load_fold_pblendvb_commute: 958; SSE41: # %bb.0: 959; SSE41-NEXT: movdqa %xmm0, %xmm1 960; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] 961; SSE41-NEXT: pblendvb %xmm0, (%rdi), %xmm1 962; SSE41-NEXT: movdqa %xmm1, %xmm0 963; SSE41-NEXT: retq 964; 965; AVX1OR2-LABEL: load_fold_pblendvb_commute: 966; AVX1OR2: # %bb.0: 967; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] 968; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 969; AVX1OR2-NEXT: retq 970; 971; AVX512VL-LABEL: load_fold_pblendvb_commute: 972; AVX512VL: # %bb.0: 973; AVX512VL-NEXT: vmovdqa (%rdi), %xmm1 974; AVX512VL-NEXT: movw $29812, %ax # imm = 0x7474 975; AVX512VL-NEXT: kmovd %eax, %k1 976; AVX512VL-NEXT: 
vmovdqu8 %xmm0, %xmm1 {%k1} 977; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 978; AVX512VL-NEXT: retq 979 %x = load <16 x i8>, <16 x i8>* %px, align 16 980 %select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> <i32 16, i32 17, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 24, i32 25, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31> 981 ret <16 x i8> %select 982} 983 984define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) { 985; SSE2-LABEL: trunc_v4i32_shuffle: 986; SSE2: # %bb.0: 987; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 988; SSE2-NEXT: packuswb %xmm0, %xmm0 989; SSE2-NEXT: packuswb %xmm0, %xmm0 990; SSE2-NEXT: retq 991; 992; SSSE3-LABEL: trunc_v4i32_shuffle: 993; SSSE3: # %bb.0: 994; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] 995; SSSE3-NEXT: retq 996; 997; SSE41-LABEL: trunc_v4i32_shuffle: 998; SSE41: # %bb.0: 999; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] 1000; SSE41-NEXT: retq 1001; 1002; AVX1OR2-LABEL: trunc_v4i32_shuffle: 1003; AVX1OR2: # %bb.0: 1004; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] 1005; AVX1OR2-NEXT: retq 1006; 1007; AVX512VL-LABEL: trunc_v4i32_shuffle: 1008; AVX512VL: # %bb.0: 1009; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0 1010; AVX512VL-NEXT: retq 1011 %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1012 ret <16 x i8> %shuffle 1013} 1014 1015define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) { 1016; We don't have anything useful to check here. This generates 100s of 1017; instructions. Instead, just make sure we survived codegen. 
1018; ALL-LABEL: stress_test0: 1019; ALL: retq 1020entry: 1021 %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6> 1022 %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28> 1023 %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8> 1024 %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29> 1025 %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29> 1026 %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17> 1027 %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23> 1028 %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17> 1029 %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef> 1030 %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 
29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10> 1031 ret <16 x i8> %s.16.0 1032} 1033 1034define <16 x i8> @undef_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind { 1035; There is nothing interesting to check about these instructions other than 1036; that they survive codegen. However, we actually do better and delete all of 1037; them because the result is 'undef'. 1038; 1039; ALL-LABEL: undef_test1: 1040; ALL: # %bb.0: # %entry 1041; ALL-NEXT: retq 1042entry: 1043 %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0> 1044 %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22> 1045 %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9> 1046 %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11> 1047 %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29> 1048 %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef> 1049 %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 
9, i32 17, i32 10> 1050 %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef> 1051 %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1052 %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1053 %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5> 1054 %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1055 %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef> 1056 1057 ret <16 x i8> %s.12.4 1058} 1059 1060define <16 x i8> @PR20540(<8 x i8> %a) { 1061; SSE-LABEL: PR20540: 1062; SSE: # %bb.0: 1063; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 1064; SSE-NEXT: retq 1065; 1066; AVX-LABEL: PR20540: 1067; AVX: # %bb.0: 1068; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1069; AVX-NEXT: retq 1070 %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 
8> 1071 ret <16 x i8> %shuffle 1072} 1073 1074define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { 1075; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1076; SSE: # %bb.0: 1077; SSE-NEXT: movzbl %dil, %eax 1078; SSE-NEXT: movd %eax, %xmm0 1079; SSE-NEXT: retq 1080; 1081; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1082; AVX: # %bb.0: 1083; AVX-NEXT: movzbl %dil, %eax 1084; AVX-NEXT: vmovd %eax, %xmm0 1085; AVX-NEXT: retq 1086 %a = insertelement <16 x i8> undef, i8 %i, i32 0 1087 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1088 ret <16 x i8> %shuffle 1089} 1090 1091define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { 1092; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1093; SSE2: # %bb.0: 1094; SSE2-NEXT: shll $8, %edi 1095; SSE2-NEXT: pxor %xmm0, %xmm0 1096; SSE2-NEXT: pinsrw $2, %edi, %xmm0 1097; SSE2-NEXT: retq 1098; 1099; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1100; SSSE3: # %bb.0: 1101; SSSE3-NEXT: shll $8, %edi 1102; SSSE3-NEXT: pxor %xmm0, %xmm0 1103; SSSE3-NEXT: pinsrw $2, %edi, %xmm0 1104; SSSE3-NEXT: retq 1105; 1106; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1107; SSE41: # %bb.0: 1108; SSE41-NEXT: pxor %xmm0, %xmm0 1109; SSE41-NEXT: pinsrb $5, %edi, %xmm0 1110; SSE41-NEXT: retq 1111; 1112; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1113; AVX: # %bb.0: 1114; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 1115; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 1116; AVX-NEXT: retq 1117 %a = insertelement <16 x i8> undef, i8 %i, i32 0 1118 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, 
i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 1119 ret <16 x i8> %shuffle 1120} 1121 1122define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { 1123; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: 1124; SSE2: # %bb.0: 1125; SSE2-NEXT: shll $8, %edi 1126; SSE2-NEXT: pxor %xmm0, %xmm0 1127; SSE2-NEXT: pinsrw $7, %edi, %xmm0 1128; SSE2-NEXT: retq 1129; 1130; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: 1131; SSSE3: # %bb.0: 1132; SSSE3-NEXT: shll $8, %edi 1133; SSSE3-NEXT: pxor %xmm0, %xmm0 1134; SSSE3-NEXT: pinsrw $7, %edi, %xmm0 1135; SSSE3-NEXT: retq 1136; 1137; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: 1138; SSE41: # %bb.0: 1139; SSE41-NEXT: pxor %xmm0, %xmm0 1140; SSE41-NEXT: pinsrb $15, %edi, %xmm0 1141; SSE41-NEXT: retq 1142; 1143; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: 1144; AVX: # %bb.0: 1145; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 1146; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 1147; AVX-NEXT: retq 1148 %a = insertelement <16 x i8> undef, i8 %i, i32 0 1149 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16> 1150 ret <16 x i8> %shuffle 1151} 1152 1153define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { 1154; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1155; SSE2: # %bb.0: 1156; SSE2-NEXT: movzbl %dil, %eax 1157; SSE2-NEXT: pxor %xmm0, %xmm0 1158; SSE2-NEXT: pinsrw $1, %eax, %xmm0 1159; SSE2-NEXT: retq 1160; 1161; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1162; SSSE3: # %bb.0: 1163; SSSE3-NEXT: movzbl %dil, %eax 1164; SSSE3-NEXT: pxor %xmm0, %xmm0 1165; SSSE3-NEXT: pinsrw $1, %eax, %xmm0 1166; SSSE3-NEXT: retq 1167; 1168; SSE41-LABEL: 
shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1169; SSE41: # %bb.0: 1170; SSE41-NEXT: pxor %xmm0, %xmm0 1171; SSE41-NEXT: pinsrb $2, %edi, %xmm0 1172; SSE41-NEXT: retq 1173; 1174; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1175; AVX: # %bb.0: 1176; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 1177; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 1178; AVX-NEXT: retq 1179 %a = insertelement <16 x i8> undef, i8 %i, i32 3 1180 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1181 ret <16 x i8> %shuffle 1182} 1183; The zero vector is the first shuffle operand - lanes 0-11 are zero, lanes 12 and 14 take bytes 0 and 2 of %a, lanes 13 and 15 are undef. The checks expect a single 12-byte pslldq/vpslldq. 1184define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) { 1185; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: 1186; SSE: # %bb.0: 1187; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] 1188; SSE-NEXT: retq 1189; 1190; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: 1191; AVX: # %bb.0: 1192; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] 1193; AVX-NEXT: retq 1194 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef> 1195 ret <16 x i8> %shuffle 1196} 1197; The zero vector is the first shuffle operand - lanes 0, 2 and 3 take bytes 12, 14 and 15 of %a, lane 1 is undef, and lanes 4-15 all select element 0 of the zero vector. The checks expect a single psrldq/vpsrldq. 1198define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { 1199; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1200; SSE: # %bb.0: 1201; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1202; SSE-NEXT: retq 1203; 1204; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1205; AVX: # %bb.0: 1206; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1207; AVX-NEXT: retq 1208 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> 1209 ret <16 x i8> %shuffle 1210} 1211; Lane 0 is byte 15 of %b and lanes 1-15 are bytes 0-14 of %a. The checks expect one palignr/vpalignr from SSSE3 upwards and a psrldq/pslldq/por sequence for plain SSE2. 1212define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { 1213; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: 1214; SSE2: # %bb.0: 1215; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1216; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1217; SSE2-NEXT: por %xmm1, %xmm0 1218; SSE2-NEXT: retq 1219; 1220; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: 1221; SSSE3: # %bb.0: 1222; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1223; SSSE3-NEXT: retq 1224; 1225; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: 1226; SSE41: # %bb.0: 1227; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1228; SSE41-NEXT: retq 1229; 1230; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: 1231; AVX: # %bb.0: 1232; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1233; AVX-NEXT: retq 1234 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> 1235 ret <16 x i8> %shuffle 1236} 1237; Same lane pattern as the function above, except that lane 0 uses index 15 instead of 31; every mask index is below 16, so only %a is read (a one-byte rotate of %a). 1238define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { 1239; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: 1240; SSE2: # %bb.0: 1241; SSE2-NEXT: movdqa %xmm0, %xmm1 1242; SSE2-NEXT: psrldq 
{{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1243; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1244; SSE2-NEXT: por %xmm1, %xmm0 1245; SSE2-NEXT: retq 1246; 1247; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: 1248; SSSE3: # %bb.0: 1249; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1250; SSSE3-NEXT: retq 1251; 1252; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: 1253; SSE41: # %bb.0: 1254; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1255; SSE41-NEXT: retq 1256; 1257; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: 1258; AVX: # %bb.0: 1259; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1260; AVX-NEXT: retq 1261 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> 1262 ret <16 x i8> %shuffle 1263} 1264 1265define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) { 1266; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: 1267; SSE2: # %bb.0: 1268; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero 1269; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] 1270; SSE2-NEXT: por %xmm1, %xmm0 1271; SSE2-NEXT: retq 1272; 1273; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: 1274; SSSE3: # %bb.0: 1275; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1276; SSSE3-NEXT: retq 1277; 1278; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: 1279; SSE41: # %bb.0: 1280; SSE41-NEXT: palignr {{.*#+}} xmm0 = 
xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1281; SSE41-NEXT: retq 1282; 1283; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: 1284; AVX: # %bb.0: 1285; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] 1286; AVX-NEXT: retq 1287 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0> 1288 ret <16 x i8> %shuffle 1289} 1290 1291define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) { 1292; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: 1293; SSE2: # %bb.0: 1294; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero 1295; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] 1296; SSE2-NEXT: por %xmm1, %xmm0 1297; SSE2-NEXT: retq 1298; 1299; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: 1300; SSSE3: # %bb.0: 1301; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] 1302; SSSE3-NEXT: movdqa %xmm1, %xmm0 1303; SSSE3-NEXT: retq 1304; 1305; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: 1306; SSE41: # %bb.0: 1307; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] 1308; SSE41-NEXT: movdqa %xmm1, %xmm0 1309; SSE41-NEXT: retq 1310; 1311; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: 1312; AVX: # %bb.0: 1313; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] 1314; AVX-NEXT: retq 1315 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16> 1316 ret <16 x i8> %shuffle 1317} 1318 1319define <16 x 
i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) { 1320; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: 1321; SSE2: # %bb.0: 1322; SSE2-NEXT: movdqa %xmm0, %xmm1 1323; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero 1324; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] 1325; SSE2-NEXT: por %xmm1, %xmm0 1326; SSE2-NEXT: retq 1327; 1328; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: 1329; SSSE3: # %bb.0: 1330; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] 1331; SSSE3-NEXT: retq 1332; 1333; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: 1334; SSE41: # %bb.0: 1335; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] 1336; SSE41-NEXT: retq 1337; 1338; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: 1339; AVX: # %bb.0: 1340; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] 1341; AVX-NEXT: retq 1342 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0> 1343 ret <16 x i8> %shuffle 1344} 1345 1346define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) { 1347; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: 1348; SSE2: # %bb.0: 1349; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1350; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1351; SSE2-NEXT: por %xmm1, %xmm0 1352; SSE2-NEXT: retq 1353; 1354; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: 1355; SSSE3: # %bb.0: 1356; SSSE3-NEXT: palignr 
{{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1357; SSSE3-NEXT: movdqa %xmm1, %xmm0 1358; SSSE3-NEXT: retq 1359; 1360; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: 1361; SSE41: # %bb.0: 1362; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1363; SSE41-NEXT: movdqa %xmm1, %xmm0 1364; SSE41-NEXT: retq 1365; 1366; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: 1367; AVX: # %bb.0: 1368; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] 1369; AVX-NEXT: retq 1370 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> 1371 ret <16 x i8> %shuffle 1372} 1373 1374; PR31151 1375define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(<16 x i8> %val1, <16 x i8> %val2) { 1376; SSE-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: 1377; SSE: # %bb.0: 1378; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1379; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1380; SSE-NEXT: retq 1381; 1382; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: 1383; AVX: # %bb.0: 1384; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1385; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1386; AVX-NEXT: retq 1387 %shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23> 1388 ret <16 x i8> %shuffle 1389} 1390 1391define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x 
i8> %a) { 1392; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: 1393; SSE2: # %bb.0: 1394; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1395; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1396; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 1397; SSE2-NEXT: retq 1398; 1399; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: 1400; SSSE3: # %bb.0: 1401; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,u,u,u,u,u,u,1,u,u,u,u,u,u,u] 1402; SSSE3-NEXT: retq 1403; 1404; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: 1405; SSE41: # %bb.0: 1406; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1407; SSE41-NEXT: retq 1408; 1409; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: 1410; AVX: # %bb.0: 1411; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1412; AVX-NEXT: retq 1413 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1414 ret <16 x i8> %shuffle 1415} 1416 1417define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { 1418; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: 1419; SSE2: # %bb.0: 1420; SSE2-NEXT: pxor %xmm1, %xmm1 1421; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1422; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1423; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1424; SSE2-NEXT: retq 1425; 1426; SSSE3-LABEL: 
shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: 1427; SSSE3: # %bb.0: 1428; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1429; SSSE3-NEXT: retq 1430; 1431; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: 1432; SSE41: # %bb.0: 1433; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1434; SSE41-NEXT: retq 1435; 1436; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: 1437; AVX: # %bb.0: 1438; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1439; AVX-NEXT: retq 1440 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 1441 ret <16 x i8> %shuffle 1442} 1443 1444define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) { 1445; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: 1446; SSE2: # %bb.0: 1447; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1448; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1449; SSE2-NEXT: retq 1450; 1451; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: 1452; SSSE3: # %bb.0: 1453; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1454; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1455; SSSE3-NEXT: retq 1456; 1457; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: 1458; SSE41: # %bb.0: 1459; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1460; SSE41-NEXT: retq 1461; 1462; AVX-LABEL: 
shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: 1463; AVX: # %bb.0: 1464; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1465; AVX-NEXT: retq 1466 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef> 1467 ret <16 x i8> %shuffle 1468} 1469 1470define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) { 1471; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: 1472; SSE2: # %bb.0: 1473; SSE2-NEXT: pxor %xmm1, %xmm1 1474; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1475; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1476; SSE2-NEXT: retq 1477; 1478; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: 1479; SSSE3: # %bb.0: 1480; SSSE3-NEXT: pxor %xmm1, %xmm1 1481; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1482; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1483; SSSE3-NEXT: retq 1484; 1485; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: 1486; SSE41: # %bb.0: 1487; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1488; SSE41-NEXT: retq 1489; 1490; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: 1491; AVX: # %bb.0: 1492; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1493; AVX-NEXT: retq 1494 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31> 1495 ret <16 x i8> %shuffle 1496} 1497 1498define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) { 1499; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: 1500; SSE2: # %bb.0: 1501; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1502; SSE2-NEXT: retq 1503; 1504; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: 1505; SSSE3: # %bb.0: 1506; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1507; SSSE3-NEXT: retq 1508; 1509; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: 1510; SSE41: # %bb.0: 1511; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1512; SSE41-NEXT: retq 1513; 1514; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: 1515; AVX: # %bb.0: 1516; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1517; AVX-NEXT: retq 1518 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef> 1519 ret <16 x i8> %shuffle 1520} 1521 1522define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) { 1523; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: 1524; SSE2: # %bb.0: 1525; SSE2-NEXT: pxor %xmm1, %xmm1 1526; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1527; SSE2-NEXT: retq 1528; 1529; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: 1530; SSSE3: # %bb.0: 1531; SSSE3-NEXT: pxor %xmm1, %xmm1 1532; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1533; SSSE3-NEXT: retq 1534; 1535; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: 1536; SSE41: # %bb.0: 1537; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1538; SSE41-NEXT: retq 1539; 1540; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: 1541; AVX: # %bb.0: 1542; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1543; AVX-NEXT: retq 1544 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31> 1545 ret <16 x i8> %shuffle 1546} 1547 1548define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) { 1549; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1550; SSE2: # %bb.0: # %entry 1551; SSE2-NEXT: pxor %xmm2, %xmm2 1552; SSE2-NEXT: movdqa %xmm0, %xmm3 1553; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] 1554; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,3,2,0,4,5,6,7] 1555; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,1] 1556; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,0,0,65535] 1557; SSE2-NEXT: pand %xmm5, %xmm4 1558; 
SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1559; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,0,1] 1560; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] 1561; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] 1562; SSE2-NEXT: pandn %xmm2, %xmm5 1563; SSE2-NEXT: por %xmm4, %xmm5 1564; SSE2-NEXT: psrlq $16, %xmm0 1565; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] 1566; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,1,3] 1567; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 1568; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,4] 1569; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 1570; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 1571; SSE2-NEXT: packuswb %xmm5, %xmm2 1572; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] 1573; SSE2-NEXT: pand %xmm0, %xmm2 1574; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] 1575; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 1576; SSE2-NEXT: pandn %xmm1, %xmm0 1577; SSE2-NEXT: por %xmm2, %xmm0 1578; SSE2-NEXT: retq 1579; 1580; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1581; SSSE3: # %bb.0: # %entry 1582; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero 1583; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] 1584; SSSE3-NEXT: por %xmm1, %xmm0 1585; SSSE3-NEXT: retq 1586; 1587; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1588; SSE41: # %bb.0: # %entry 1589; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero 1590; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] 1591; 
SSE41-NEXT: por %xmm1, %xmm0 1592; SSE41-NEXT: retq 1593; 1594; AVX1-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1595; AVX1: # %bb.0: # %entry 1596; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero 1597; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] 1598; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1599; AVX1-NEXT: retq 1600; 1601; AVX2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1602; AVX2: # %bb.0: # %entry 1603; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero 1604; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] 1605; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1606; AVX2-NEXT: retq 1607; 1608; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1609; AVX512VLBW: # %bb.0: # %entry 1610; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero 1611; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] 1612; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1613; AVX512VLBW-NEXT: retq 1614; 1615; AVX512VLVBMI-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1616; AVX512VLVBMI: # %bb.0: # %entry 1617; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0> 1618; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 1619; AVX512VLVBMI-NEXT: retq 1620; 1621; XOP-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: 1622; XOP: # %bb.0: # %entry 1623; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[u,10,2,7],xmm1[6],xmm0[14,7,2],xmm1[2],xmm0[3,1,14],xmm1[2],xmm0[9,11,0] 1624; XOP-NEXT: retq 1625entry: 1626 %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> 
<i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0> 1627 1628 ret <16 x i8> %shuffle 1629} 1630 1631define <16 x i8> @shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(<8 x i16> %a0, <8 x i16> %a1) { 1632; SSE-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: 1633; SSE: # %bb.0: 1634; SSE-NEXT: psrlw $8, %xmm0 1635; SSE-NEXT: psrlw $8, %xmm1 1636; SSE-NEXT: packuswb %xmm1, %xmm0 1637; SSE-NEXT: retq 1638; 1639; AVX1-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: 1640; AVX1: # %bb.0: 1641; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1642; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1643; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1644; AVX1-NEXT: retq 1645; 1646; AVX2OR512VL-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: 1647; AVX2OR512VL: # %bb.0: 1648; AVX2OR512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 1649; AVX2OR512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 1650; AVX2OR512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1651; AVX2OR512VL-NEXT: retq 1652; 1653; XOP-LABEL: shuffe_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: 1654; XOP: # %bb.0: 1655; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15] 1656; XOP-NEXT: retq 1657 %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1658 %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 1659 %3 = bitcast <8 x i16> %1 to <16 x i8> 1660 %4 = bitcast <8 x i16> %2 to <16 x i8> 1661 %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 1662 ret <16 x i8> %5 1663} 1664 1665define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) { 1666; Nothing interesting to test here. Just make sure we didn't crash.
1667; ALL-LABEL: stress_test2: 1668; ALL: retq 1669entry: 1670 %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5> 1671 %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22> 1672 %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19> 1673 1674 ret <16 x i8> %s.2.0 1675} 1676 1677define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) { 1678; SSE-LABEL: constant_gets_selected: 1679; SSE: # %bb.0: # %entry 1680; SSE-NEXT: xorps %xmm0, %xmm0 1681; SSE-NEXT: movaps %xmm0, (%rdi) 1682; SSE-NEXT: movaps %xmm0, (%rsi) 1683; SSE-NEXT: retq 1684; 1685; AVX-LABEL: constant_gets_selected: 1686; AVX: # %bb.0: # %entry 1687; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1688; AVX-NEXT: vmovaps %xmm0, (%rdi) 1689; AVX-NEXT: vmovaps %xmm0, (%rsi) 1690; AVX-NEXT: retq 1691entry: 1692 %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8> 1693 %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27> 1694 %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32> 1695 store <4 x i32> %weirder_zero, <4 x i32>* %ptr1, align 16 1696 store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16 1697 ret void 1698} 1699 1700; 1701; Shuffle to logical bit shifts 1702; 1703 1704define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> 
%b) { 1705; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: 1706; SSE: # %bb.0: 1707; SSE-NEXT: psllw $8, %xmm0 1708; SSE-NEXT: retq 1709; 1710; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14: 1711; AVX: # %bb.0: 1712; AVX-NEXT: vpsllw $8, %xmm0, %xmm0 1713; AVX-NEXT: retq 1714 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14> 1715 ret <16 x i8> %shuffle 1716} 1717 1718define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) { 1719; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: 1720; SSE: # %bb.0: 1721; SSE-NEXT: pslld $24, %xmm0 1722; SSE-NEXT: retq 1723; 1724; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12: 1725; AVX: # %bb.0: 1726; AVX-NEXT: vpslld $24, %xmm0, %xmm0 1727; AVX-NEXT: retq 1728 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12> 1729 ret <16 x i8> %shuffle 1730} 1731 1732define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) { 1733; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08: 1734; SSE: # %bb.0: 1735; SSE-NEXT: psllq $56, %xmm0 1736; SSE-NEXT: retq 1737; 1738; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08: 1739; AVX: # %bb.0: 1740; AVX-NEXT: vpsllq $56, %xmm0, %xmm0 1741; AVX-NEXT: retq 1742 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8> 1743 ret <16 x i8> %shuffle 1744} 1745 1746define <16 x i8> 
@shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
; SSE: # %bb.0:
; SSE-NEXT: psllq $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
; AVX: # %bb.0:
; AVX-NEXT: vpsllq $8, %xmm0, %xmm0
; AVX-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
  ret <16 x i8> %shuffle
}

; Every defined lane matches "even byte = odd byte of %a, odd byte = zero"
; (index 16 reads the zeroinitializer operand); the remaining lanes are undef.
; The checks expect a single 16-bit logical right shift by 8. %b is unused.
define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
; SSE: # %bb.0:
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
  ret <16 x i8> %shuffle
}

; In the 4-byte groups that are defined, the low two bytes are the high two
; bytes of the matching group of %a and the high two bytes are zero (index 16);
; the middle groups are entirely or partly undef. The checks expect a single
; 32-bit logical right shift by 16. %b is unused.
define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
; SSE: # %bb.0:
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14,
i32 15, i32 16, i32 16>
  ret <16 x i8> %shuffle
}

; Each 8-byte half of the result starts with the top byte of the matching half
; of %a (indices 7 and 15); every other defined lane is zero (index 16) and the
; rest are undef. The checks expect a single 64-bit logical right shift by 56.
; %b is unused.
define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
; SSE: # %bb.0:
; SSE-NEXT: psrlq $56, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $56, %xmm0, %xmm0
; AVX-NEXT: retq
  %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
  ret <16 x i8> %shuffle
}

; Bytes 1-6 of %a are placed at result positions 4-9 and every other lane is
; zero. The zero comes from lane 0 of a constant second operand whose other
; lanes are undef, so index 16 is the only usable lane of that operand.
; With only SSE2 the checks expect three whole-register byte shifts; the
; configurations that have pshufb expect one pshufb with zeroing lanes.
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
; SSE2: # %bb.0:
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 =
zero,zero,zero,zero,xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero 1823; AVX-NEXT: retq 1824 %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1825 ret <16 x i8> %shuffle 1826} 1827 1828define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { 1829; SSE-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1830; SSE: # %bb.0: 1831; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] 1832; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1833; SSE-NEXT: retq 1834; 1835; AVX1-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1836; AVX1: # %bb.0: 1837; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] 1838; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1839; AVX1-NEXT: retq 1840; 1841; AVX2-SLOW-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1842; AVX2-SLOW: # %bb.0: 1843; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] 1844; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1845; AVX2-SLOW-NEXT: retq 1846; 1847; AVX2-FAST-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1848; AVX2-FAST: # %bb.0: 1849; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1850; AVX2-FAST-NEXT: retq 1851; 1852; AVX512VL-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1853; 
AVX512VL: # %bb.0: 1854; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1855; AVX512VL-NEXT: retq 1856; 1857; XOP-LABEL: shuffle_v16i8_01_02_03_04_05_06_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: 1858; XOP: # %bb.0: 1859; XOP-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6] 1860; XOP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1861; XOP-NEXT: retq 1862 %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1863 ret <16 x i8> %shuffle 1864} 1865 1866define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06(<16 x i8> %a) { 1867; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: 1868; SSE: # %bb.0: 1869; SSE-NEXT: psrlq $8, %xmm0 1870; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 1871; SSE-NEXT: retq 1872; 1873; AVX1-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: 1874; AVX1: # %bb.0: 1875; AVX1-NEXT: vpsrlq $8, %xmm0, %xmm0 1876; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 1877; AVX1-NEXT: retq 1878; 1879; AVX2-SLOW-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: 1880; AVX2-SLOW: # %bb.0: 1881; AVX2-SLOW-NEXT: vpsrlq $8, %xmm0, %xmm0 1882; AVX2-SLOW-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 1883; AVX2-SLOW-NEXT: retq 1884; 1885; AVX2-FAST-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: 1886; AVX2-FAST: # %bb.0: 1887; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6] 1888; AVX2-FAST-NEXT: retq 1889; 1890; AVX512VL-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: 1891; AVX512VL: # %bb.0: 1892; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,2,3,4,5,6] 1893; AVX512VL-NEXT: retq 1894; 1895; XOP-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_01_02_03_04_05_06: 1896; XOP: # %bb.0: 1897; XOP-NEXT: vpsrlq $8, %xmm0, %xmm0 1898; XOP-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] 1899; XOP-NEXT: retq 1900 %shuffle = shufflevector <16 x i8> %a, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6> 1901 ret <16 x i8> %shuffle 1902} 1903 1904define <16 x i8> @shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14(<16 x i8> %a) { 1905; SSE2-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14: 1906; SSE2: # %bb.0: 1907; SSE2-NEXT: movdqa %xmm0, %xmm1 1908; SSE2-NEXT: psrld $24, %xmm1 1909; SSE2-NEXT: pslld $8, %xmm0 1910; SSE2-NEXT: por %xmm1, %xmm0 1911; SSE2-NEXT: retq 1912; 1913; SSSE3-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14: 1914; SSSE3: # %bb.0: 1915; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14] 1916; SSSE3-NEXT: retq 1917; 1918; SSE41-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14: 1919; SSE41: # %bb.0: 1920; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14] 1921; SSE41-NEXT: retq 1922; 1923; AVX1-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14: 1924; AVX1: # %bb.0: 1925; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14] 1926; AVX1-NEXT: retq 1927; 1928; AVX2-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14: 1929; AVX2: # %bb.0: 1930; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14] 1931; AVX2-NEXT: retq 1932; 1933; AVX512VL-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14: 1934; AVX512VL: # %bb.0: 1935; AVX512VL-NEXT: vprold $8, %xmm0, %xmm0 1936; AVX512VL-NEXT: retq 1937; 1938; XOP-LABEL: shuffle_v16i8_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14: 1939; XOP: # %bb.0: 1940; XOP-NEXT: vprotd $8, %xmm0, %xmm0 1941; XOP-NEXT: retq 1942 %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14> 1943 ret <16 x i8> %shuffle 1944} 1945 1946; PR44379 1947define <16 x i8> @shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09(<16 x i8> %a) { 1948; SSE-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09: 1949; SSE: # %bb.0: 1950; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] 1951; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] 1952; SSE-NEXT: retq 1953; 1954; AVX1-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09: 1955; AVX1: # %bb.0: 1956; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] 1957; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] 1958; AVX1-NEXT: retq 1959; 1960; AVX2-SLOW-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09: 1961; AVX2-SLOW: # %bb.0: 1962; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] 1963; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] 1964; AVX2-SLOW-NEXT: retq 1965; 1966; AVX2-FAST-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09: 1967; AVX2-FAST: # %bb.0: 1968; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9] 1969; AVX2-FAST-NEXT: retq 
1970; 1971; AVX512VL-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09: 1972; AVX512VL: # %bb.0: 1973; AVX512VL-NEXT: vprolq $48, %xmm0, %xmm0 1974; AVX512VL-NEXT: retq 1975; 1976; XOP-LABEL: shuffle_v16i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09: 1977; XOP: # %bb.0: 1978; XOP-NEXT: vprotq $48, %xmm0, %xmm0 1979; XOP-NEXT: retq 1980 %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9> 1981 ret <16 x i8> %shuffle 1982} 1983 1984define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) { 1985; SSE-LABEL: PR12412: 1986; SSE: # %bb.0: # %entry 1987; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1988; SSE-NEXT: pand %xmm2, %xmm1 1989; SSE-NEXT: pand %xmm2, %xmm0 1990; SSE-NEXT: packuswb %xmm1, %xmm0 1991; SSE-NEXT: retq 1992; 1993; AVX1-LABEL: PR12412: 1994; AVX1: # %bb.0: # %entry 1995; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1996; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1997; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1998; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1999; AVX1-NEXT: retq 2000; 2001; AVX2-LABEL: PR12412: 2002; AVX2: # %bb.0: # %entry 2003; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2004; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 2005; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 2006; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2007; AVX2-NEXT: retq 2008; 2009; AVX512VL-LABEL: PR12412: 2010; AVX512VL: # %bb.0: # %entry 2011; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 2012; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2013; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0 2014; AVX512VL-NEXT: vzeroupper 2015; AVX512VL-NEXT: retq 2016; 2017; XOP-LABEL: PR12412: 2018; XOP: # %bb.0: # %entry 2019; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm1[0,2,4,6,8,10,12,14] 2020; XOP-NEXT: retq 2021entry: 2022 %0 = shufflevector <16 x 
i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30> 2023 ret <16 x i8> %0 2024} 2025 2026define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) { 2027; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz: 2028; SSE: # %bb.0: 2029; SSE-NEXT: psrld $8, %xmm0 2030; SSE-NEXT: retq 2031; 2032; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz: 2033; AVX: # %bb.0: 2034; AVX-NEXT: vpsrld $8, %xmm0, %xmm0 2035; AVX-NEXT: retq 2036 %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16> 2037 ret <16 x i8> %shuffle 2038} 2039 2040define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) { 2041; SSE-LABEL: shuffle_v16i8_bitcast_unpack: 2042; SSE: # %bb.0: 2043; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2044; SSE-NEXT: retq 2045; 2046; AVX-LABEL: shuffle_v16i8_bitcast_unpack: 2047; AVX: # %bb.0: 2048; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2049; AVX-NEXT: retq 2050 %shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16> 2051 %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float> 2052 %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 2053 %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16> 2054 %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, 
i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 2055 %bitcast8 = bitcast <8 x i16> %shuffle16 to <16 x i8> 2056 ret <16 x i8> %bitcast8 2057} 2058 2059define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) { 2060; SSE2-LABEL: insert_dup_mem_v16i8_i32: 2061; SSE2: # %bb.0: 2062; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2063; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2064; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2065; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2066; SSE2-NEXT: retq 2067; 2068; SSSE3-LABEL: insert_dup_mem_v16i8_i32: 2069; SSSE3: # %bb.0: 2070; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2071; SSSE3-NEXT: pxor %xmm1, %xmm1 2072; SSSE3-NEXT: pshufb %xmm1, %xmm0 2073; SSSE3-NEXT: retq 2074; 2075; SSE41-LABEL: insert_dup_mem_v16i8_i32: 2076; SSE41: # %bb.0: 2077; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2078; SSE41-NEXT: pxor %xmm1, %xmm1 2079; SSE41-NEXT: pshufb %xmm1, %xmm0 2080; SSE41-NEXT: retq 2081; 2082; AVX1-LABEL: insert_dup_mem_v16i8_i32: 2083; AVX1: # %bb.0: 2084; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2085; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2086; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 2087; AVX1-NEXT: retq 2088; 2089; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_i32: 2090; AVX2OR512VL: # %bb.0: 2091; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0 2092; AVX2OR512VL-NEXT: retq 2093; 2094; XOPAVX1-LABEL: insert_dup_mem_v16i8_i32: 2095; XOPAVX1: # %bb.0: 2096; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2097; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2098; XOPAVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 2099; XOPAVX1-NEXT: retq 2100; 2101; XOPAVX2-LABEL: insert_dup_mem_v16i8_i32: 2102; XOPAVX2: # %bb.0: 2103; XOPAVX2-NEXT: vpbroadcastb (%rdi), %xmm0 2104; XOPAVX2-NEXT: retq 2105 %tmp = load i32, i32* %ptr, align 4 2106 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 2107 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> 2108 
%tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> zeroinitializer 2109 ret <16 x i8> %tmp3 2110} 2111 2112define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) { 2113; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8: 2114; SSE2: # %bb.0: 2115; SSE2-NEXT: movzbl (%rdi), %eax 2116; SSE2-NEXT: movd %eax, %xmm0 2117; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2118; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 2119; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2120; SSE2-NEXT: retq 2121; 2122; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8: 2123; SSSE3: # %bb.0: 2124; SSSE3-NEXT: movzbl (%rdi), %eax 2125; SSSE3-NEXT: movd %eax, %xmm0 2126; SSSE3-NEXT: pxor %xmm1, %xmm1 2127; SSSE3-NEXT: pshufb %xmm1, %xmm0 2128; SSSE3-NEXT: retq 2129; 2130; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8: 2131; SSE41: # %bb.0: 2132; SSE41-NEXT: movzbl (%rdi), %eax 2133; SSE41-NEXT: movd %eax, %xmm0 2134; SSE41-NEXT: pxor %xmm1, %xmm1 2135; SSE41-NEXT: pshufb %xmm1, %xmm0 2136; SSE41-NEXT: retq 2137; 2138; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8: 2139; AVX1: # %bb.0: 2140; AVX1-NEXT: movzbl (%rdi), %eax 2141; AVX1-NEXT: vmovd %eax, %xmm0 2142; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2143; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 2144; AVX1-NEXT: retq 2145; 2146; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_sext_i8: 2147; AVX2OR512VL: # %bb.0: 2148; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0 2149; AVX2OR512VL-NEXT: retq 2150; 2151; XOPAVX1-LABEL: insert_dup_mem_v16i8_sext_i8: 2152; XOPAVX1: # %bb.0: 2153; XOPAVX1-NEXT: movzbl (%rdi), %eax 2154; XOPAVX1-NEXT: vmovd %eax, %xmm0 2155; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2156; XOPAVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 2157; XOPAVX1-NEXT: retq 2158; 2159; XOPAVX2-LABEL: insert_dup_mem_v16i8_sext_i8: 2160; XOPAVX2: # %bb.0: 2161; XOPAVX2-NEXT: vpbroadcastb (%rdi), %xmm0 2162; XOPAVX2-NEXT: retq 2163 %tmp = load i8, i8* %ptr, align 1 2164 %tmp1 = sext i8 %tmp to i32 2165 %tmp2 = insertelement <4 x 
i32> zeroinitializer, i32 %tmp1, i32 0 2166 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8> 2167 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer 2168 ret <16 x i8> %tmp4 2169} 2170 2171define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) { 2172; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32: 2173; SSE2: # %bb.0: 2174; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2175; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2176; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] 2177; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2178; SSE2-NEXT: retq 2179; 2180; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32: 2181; SSSE3: # %bb.0: 2182; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2183; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2184; SSSE3-NEXT: retq 2185; 2186; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32: 2187; SSE41: # %bb.0: 2188; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2189; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2190; SSE41-NEXT: retq 2191; 2192; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32: 2193; AVX1: # %bb.0: 2194; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2195; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2196; AVX1-NEXT: retq 2197; 2198; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i8_i32: 2199; AVX2OR512VL: # %bb.0: 2200; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %xmm0 2201; AVX2OR512VL-NEXT: retq 2202; 2203; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i8_i32: 2204; XOPAVX1: # %bb.0: 2205; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2206; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2207; XOPAVX1-NEXT: retq 2208; 2209; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i8_i32: 2210; XOPAVX2: # %bb.0: 2211; XOPAVX2-NEXT: vpbroadcastb 1(%rdi), %xmm0 2212; XOPAVX2-NEXT: retq 2213 %tmp = load i32, i32* %ptr, align 4 2214 %tmp1 = insertelement <4 x 
i32> zeroinitializer, i32 %tmp, i32 0 2215 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> 2216 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2217 ret <16 x i8> %tmp3 2218} 2219 2220define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) { 2221; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32: 2222; SSE2: # %bb.0: 2223; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2224; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2225; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 2226; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2227; SSE2-NEXT: retq 2228; 2229; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32: 2230; SSSE3: # %bb.0: 2231; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2232; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2233; SSSE3-NEXT: retq 2234; 2235; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32: 2236; SSE41: # %bb.0: 2237; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2238; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2239; SSE41-NEXT: retq 2240; 2241; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32: 2242; AVX1: # %bb.0: 2243; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2244; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2245; AVX1-NEXT: retq 2246; 2247; AVX2OR512VL-LABEL: insert_dup_elt2_mem_v16i8_i32: 2248; AVX2OR512VL: # %bb.0: 2249; AVX2OR512VL-NEXT: vpbroadcastb 2(%rdi), %xmm0 2250; AVX2OR512VL-NEXT: retq 2251; 2252; XOPAVX1-LABEL: insert_dup_elt2_mem_v16i8_i32: 2253; XOPAVX1: # %bb.0: 2254; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2255; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2256; XOPAVX1-NEXT: retq 2257; 2258; XOPAVX2-LABEL: insert_dup_elt2_mem_v16i8_i32: 2259; XOPAVX2: # %bb.0: 2260; XOPAVX2-NEXT: vpbroadcastb 2(%rdi), %xmm0 
2261; XOPAVX2-NEXT: retq 2262 %tmp = load i32, i32* %ptr, align 4 2263 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 2264 %tmp2 = bitcast <4 x i32> %tmp1 to <16 x i8> 2265 %tmp3 = shufflevector <16 x i8> %tmp2, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2266 ret <16 x i8> %tmp3 2267} 2268 2269define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) { 2270; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: 2271; SSE2: # %bb.0: 2272; SSE2-NEXT: movsbl (%rdi), %eax 2273; SSE2-NEXT: movd %eax, %xmm0 2274; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2275; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] 2276; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2277; SSE2-NEXT: retq 2278; 2279; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: 2280; SSSE3: # %bb.0: 2281; SSSE3-NEXT: movsbl (%rdi), %eax 2282; SSSE3-NEXT: movd %eax, %xmm0 2283; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2284; SSSE3-NEXT: retq 2285; 2286; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: 2287; SSE41: # %bb.0: 2288; SSE41-NEXT: movsbl (%rdi), %eax 2289; SSE41-NEXT: movd %eax, %xmm0 2290; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2291; SSE41-NEXT: retq 2292; 2293; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: 2294; AVX1: # %bb.0: 2295; AVX1-NEXT: movsbl (%rdi), %eax 2296; AVX1-NEXT: vmovd %eax, %xmm0 2297; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2298; AVX1-NEXT: retq 2299; 2300; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: 2301; AVX2: # %bb.0: 2302; AVX2-NEXT: movsbl (%rdi), %eax 2303; AVX2-NEXT: shrl $8, %eax 2304; AVX2-NEXT: vmovd %eax, %xmm0 2305; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 2306; AVX2-NEXT: retq 2307; 2308; AVX512VL-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: 2309; AVX512VL: # %bb.0: 2310; AVX512VL-NEXT: movsbl (%rdi), %eax 2311; 
AVX512VL-NEXT: shrl $8, %eax 2312; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0 2313; AVX512VL-NEXT: retq 2314; 2315; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: 2316; XOPAVX1: # %bb.0: 2317; XOPAVX1-NEXT: movsbl (%rdi), %eax 2318; XOPAVX1-NEXT: vmovd %eax, %xmm0 2319; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 2320; XOPAVX1-NEXT: retq 2321; 2322; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: 2323; XOPAVX2: # %bb.0: 2324; XOPAVX2-NEXT: movsbl (%rdi), %eax 2325; XOPAVX2-NEXT: shrl $8, %eax 2326; XOPAVX2-NEXT: vmovd %eax, %xmm0 2327; XOPAVX2-NEXT: vpbroadcastb %xmm0, %xmm0 2328; XOPAVX2-NEXT: retq 2329 %tmp = load i8, i8* %ptr, align 1 2330 %tmp1 = sext i8 %tmp to i32 2331 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 2332 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8> 2333 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 2334 ret <16 x i8> %tmp4 2335} 2336 2337define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) { 2338; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: 2339; SSE2: # %bb.0: 2340; SSE2-NEXT: movsbl (%rdi), %eax 2341; SSE2-NEXT: movd %eax, %xmm0 2342; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2343; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 2344; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 2345; SSE2-NEXT: retq 2346; 2347; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: 2348; SSSE3: # %bb.0: 2349; SSSE3-NEXT: movsbl (%rdi), %eax 2350; SSSE3-NEXT: movd %eax, %xmm0 2351; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2352; SSSE3-NEXT: retq 2353; 2354; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: 2355; SSE41: # %bb.0: 2356; SSE41-NEXT: movsbl (%rdi), %eax 2357; SSE41-NEXT: movd %eax, %xmm0 2358; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2359; SSE41-NEXT: 
retq 2360; 2361; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: 2362; AVX1: # %bb.0: 2363; AVX1-NEXT: movsbl (%rdi), %eax 2364; AVX1-NEXT: vmovd %eax, %xmm0 2365; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2366; AVX1-NEXT: retq 2367; 2368; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: 2369; AVX2: # %bb.0: 2370; AVX2-NEXT: movsbl (%rdi), %eax 2371; AVX2-NEXT: shrl $16, %eax 2372; AVX2-NEXT: vmovd %eax, %xmm0 2373; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 2374; AVX2-NEXT: retq 2375; 2376; AVX512VL-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: 2377; AVX512VL: # %bb.0: 2378; AVX512VL-NEXT: movsbl (%rdi), %eax 2379; AVX512VL-NEXT: shrl $16, %eax 2380; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0 2381; AVX512VL-NEXT: retq 2382; 2383; XOPAVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: 2384; XOPAVX1: # %bb.0: 2385; XOPAVX1-NEXT: movsbl (%rdi), %eax 2386; XOPAVX1-NEXT: vmovd %eax, %xmm0 2387; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] 2388; XOPAVX1-NEXT: retq 2389; 2390; XOPAVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: 2391; XOPAVX2: # %bb.0: 2392; XOPAVX2-NEXT: movsbl (%rdi), %eax 2393; XOPAVX2-NEXT: shrl $16, %eax 2394; XOPAVX2-NEXT: vmovd %eax, %xmm0 2395; XOPAVX2-NEXT: vpbroadcastb %xmm0, %xmm0 2396; XOPAVX2-NEXT: retq 2397 %tmp = load i8, i8* %ptr, align 1 2398 %tmp1 = sext i8 %tmp to i32 2399 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 2400 %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8> 2401 %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 2402 ret <16 x i8> %tmp4 2403} 2404 2405define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) { 2406; SSE2-LABEL: PR31364: 2407; SSE2: # %bb.0: 2408; SSE2-NEXT: movzbl (%rdi), %eax 2409; SSE2-NEXT: movzbl (%rsi), %ecx 2410; SSE2-NEXT: shll $8, %ecx 2411; SSE2-NEXT: orl %eax, %ecx 2412; SSE2-NEXT: movd %ecx, 
%xmm1 2413; SSE2-NEXT: pxor %xmm0, %xmm0 2414; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2415; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7] 2416; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] 2417; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 2418; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] 2419; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4] 2420; SSE2-NEXT: packuswb %xmm1, %xmm0 2421; SSE2-NEXT: retq 2422; 2423; SSSE3-LABEL: PR31364: 2424; SSSE3: # %bb.0: 2425; SSSE3-NEXT: movzbl (%rdi), %eax 2426; SSSE3-NEXT: movzbl (%rsi), %ecx 2427; SSSE3-NEXT: shll $8, %ecx 2428; SSSE3-NEXT: orl %eax, %ecx 2429; SSSE3-NEXT: movd %ecx, %xmm0 2430; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] 2431; SSSE3-NEXT: retq 2432; 2433; SSE41-LABEL: PR31364: 2434; SSE41: # %bb.0: 2435; SSE41-NEXT: movzbl (%rdi), %eax 2436; SSE41-NEXT: movd %eax, %xmm0 2437; SSE41-NEXT: pinsrb $1, (%rsi), %xmm0 2438; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] 2439; SSE41-NEXT: retq 2440; 2441; AVX-LABEL: PR31364: 2442; AVX: # %bb.0: 2443; AVX-NEXT: movzbl (%rdi), %eax 2444; AVX-NEXT: vmovd %eax, %xmm0 2445; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 2446; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] 2447; AVX-NEXT: retq 2448 %v0 = load i8, i8* %a, align 1 2449 %vecins = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %v0, i32 0 2450 %v1 = load i8, i8* %b, align 1 2451 %vecins2 = insertelement <16 x i8> %vecins, i8 %v1, i32 1 2452 %result = shufflevector <16 x i8> %vecins2, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 3, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, 
i32 0, i32 0>
  ret <16 x i8> %result
}

; PR31301 test case (the name suggests a bug-report reproducer; confirm
; against the tracker).
; Loads one byte from %x and one byte from %y, splats each across the low
; eight elements of a <16 x i8>, then interleaves the two splats so the
; returned vector is x,y,x,y,... across all sixteen bytes.
; Per the assertions below, the AVX2 / AVX512VL and XOP+AVX2 runs expect two
; vpbroadcastb memory loads followed by one vpunpcklbw, while the XOP+AVX1
; run expects the two scalar loads to feed a single vpperm.
define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; SSE2-LABEL: PR31301:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: movzbl (%rsi), %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR31301:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: movzbl (%rsi), %eax
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR31301:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: movzbl (%rsi), %eax
; SSE41-NEXT: movd %eax, %xmm2
; SSE41-NEXT: pshufb %xmm1, %xmm2
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR31301:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: movzbl (%rsi), %eax
; AVX1-NEXT: vmovd %eax, %xmm2
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: PR31301:
; AVX2OR512VL: # %bb.0: # %entry
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2OR512VL-NEXT: vpbroadcastb (%rsi), %xmm1
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT: retq
;
; XOPAVX1-LABEL: PR31301:
; XOPAVX1: # %bb.0: # %entry
; XOPAVX1-NEXT: movzbl (%rdi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm0
; XOPAVX1-NEXT: movzbl (%rsi), %eax
; XOPAVX1-NEXT: vmovd %eax, %xmm1
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0],xmm0[0],xmm1[0]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: PR31301:
; XOPAVX2: # %bb.0: # %entry
; XOPAVX2-NEXT: vpbroadcastb (%rdi), %xmm0
; XOPAVX2-NEXT: vpbroadcastb (%rsi), %xmm1
; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; XOPAVX2-NEXT: retq
entry:
  ; Scalar byte from %x goes into element 0; every other element stays undef.
  %0 = load i8, i8* %x, align 1
  %1 = insertelement <16 x i8> undef, i8 %0, i32 0
  ; Splat element 0 into result elements 0-7; elements 8-15 are left undef.
  %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Same load / insert / low-half splat sequence for the byte at %y.
  %2 = load i8, i8* %y, align 1
  %3 = insertelement <16 x i8> undef, i8 %2, i32 0
  %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ; Zip the two low halves: mask indices 0-7 select from %lane and 16-23 select
  ; from %lane3, alternating one element from each operand.
  %vzip.i = shufflevector <16 x i8> %lane, <16 x i8> %lane3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ret <16 x i8> %vzip.i
}