; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-PERLANE

; Single-source shuffle of a <2 x i1> argument that swaps its two elements (mask <1,0>).
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i1> %b
}

; Two-source <2 x i1> shuffle: element 1 of %a followed by element 0 of the constant <1,0> (mask <1,2>).
define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0]
; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
  ret <2 x i1> %b
}


; Single-source shuffle that reverses a <4 x i1> argument (mask <3,2,1,0>).
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512F-LABEL: shuf4i1_3_2_10:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf4i1_3_2_10:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i1> %b
}

; Shuffle of two <8 x i1> values produced by icmp eq on <8 x i64>; every mask index is below 8,
; so only the first compare result (%a2) is actually selected from.
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <8 x i64> %a, %a1
  %b2 = icmp eq <8 x i64> %b, %b1
  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  ret <8 x i1> %c
}

; Two-source shuffle of <16 x i1> compare results; mask indices 22 and 21 select from the
; second compare (%b2), all others from the first (%a2).
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <16 x i32> %a, %a1
  %b2 = icmp eq <16 x i32> %b, %b1
  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <16 x i1> %c
}

; Single-source shuffle of a <32 x i1> argument; the 16-entry index pattern is repeated twice.
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <32 x i1> %b
}

; Same 32-entry mask applied to the result of icmp eq <32 x i16> %a, zeroinitializer; the
; shuffled mask is the condition of a select between <32 x i16> %c and %d.
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512VL-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i16> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}

; Same 32-entry mask applied to the result of icmp eq <32 x i8> %a, zeroinitializer; the
; shuffled mask is the condition of a select between <32 x i8> %c and %d.
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512VL-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i8> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}

; As the icmp_v32i16 case, but the <32 x i1> mask is first built by concatenating two
; <16 x i1> compares of <16 x i32> inputs against zero, then shuffled and used in a select.
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}

; As the icmp_v32i8 case, but the <32 x i1> mask is first built by concatenating two
; <16 x i1> compares of <16 x i32> inputs against zero, then shuffled and used in a select.
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}

; i8 argument bitcast to <8 x i1>, then shuffled with a mask whose only defined index is 2
; (all other lanes undef). This is the one complete function here with separate FAST-ALL and
; FAST-PERLANE expectations.
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    kmovw %edi, %k1
; AVX512VL-FAST-ALL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2]
; AVX512VL-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-FAST-ALL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-FAST-ALL-NEXT:    vzeroupper
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    kmovw %edi, %k1
; AVX512VL-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-FAST-PERLANE-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-FAST-PERLANE-NEXT:    vzeroupper
; AVX512VL-FAST-PERLANE-NEXT:    retq
;
; VL_BW_DQ-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ-FAST-ALL:       # %bb.0:
; VL_BW_DQ-FAST-ALL-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; VL_BW_DQ-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-FAST-ALL-NEXT:    vzeroupper
; VL_BW_DQ-FAST-ALL-NEXT:    retq
;
; VL_BW_DQ-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ-FAST-PERLANE:       # %bb.0:
; VL_BW_DQ-FAST-PERLANE-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpbroadcastq %xmm0, %ymm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vzeroupper
; VL_BW_DQ-FAST-PERLANE-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
  ret <8 x i1> %c
}

; i8 -> <8 x i1> -> shuffle with a zeroinitializer second operand (indices 10 and 9 select
; zero lanes, three lanes are undef) -> bitcast back to i8.
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; i8 -> <8 x i1> -> single-source shuffle keeping elements 0,1,4,5 in the low four lanes
; (upper four lanes undef) -> bitcast back to i8.
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; i8 -> <8 x i1> -> shuffle whose first lane (index 9) comes from the zeroinitializer second
; operand and whose remaining lanes come from %b -> bitcast back to i8.
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %d = bitcast <8 x i1>%c to i8
  ret i8 %d
}

; Operand order is swapped relative to the previous test: zeroinitializer is the first
; shuffle operand and %b the second, so only indices 9 and 10 select lanes of %b.
define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
  %d = bitcast <8 x i1>%c to i8
  ret i8 %d
}

define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 =
ymm0[0],mem[1,2,3,4,5,6,7] 765; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 766; VL_BW_DQ-NEXT: kmovd %k0, %eax 767; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax 768; VL_BW_DQ-NEXT: vzeroupper 769; VL_BW_DQ-NEXT: retq 770 %b = bitcast i8 %a to <8 x i1> 771 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1> 772 %c1 = bitcast <8 x i1>%c to i8 773 ret i8 %c1 774} 775 776define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { 777; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: 778; AVX512F: # %bb.0: 779; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 780; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 781; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 782; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 783; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] 784; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 785; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 786; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 787; AVX512F-NEXT: kmovw %k0, %eax 788; AVX512F-NEXT: # kill: def $al killed $al killed $eax 789; AVX512F-NEXT: vzeroupper 790; AVX512F-NEXT: retq 791; 792; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: 793; AVX512VL: # %bb.0: 794; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 795; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 796; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 797; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 798; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} 799; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7] 800; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 801; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0 802; AVX512VL-NEXT: kmovw %k0, %eax 803; AVX512VL-NEXT: # kill: def $al killed $al killed $eax 804; AVX512VL-NEXT: vzeroupper 805; AVX512VL-NEXT: retq 806; 807; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: 808; VL_BW_DQ: # %bb.0: 809; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0 810; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0 811; VL_BW_DQ-NEXT: 
vpmovm2d %k0, %ymm0 812; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7] 813; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 814; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2 815; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 816; VL_BW_DQ-NEXT: kmovd %k0, %eax 817; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax 818; VL_BW_DQ-NEXT: vzeroupper 819; VL_BW_DQ-NEXT: retq 820 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0> 821 %c1 = bitcast <8 x i1>%c to i8 822 ret i8 %c1 823} 824 825define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { 826; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: 827; AVX512F: # %bb.0: 828; AVX512F-NEXT: kmovw %edi, %k1 829; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 830; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 831; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 832; AVX512F-NEXT: kmovw %k0, %eax 833; AVX512F-NEXT: # kill: def $ax killed $ax killed $eax 834; AVX512F-NEXT: vzeroupper 835; AVX512F-NEXT: retq 836; 837; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: 838; AVX512VL: # %bb.0: 839; AVX512VL-NEXT: kmovw %edi, %k1 840; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 841; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 842; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 843; AVX512VL-NEXT: kmovw %k0, %eax 844; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax 845; AVX512VL-NEXT: vzeroupper 846; AVX512VL-NEXT: retq 847; 848; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: 849; VL_BW_DQ: # %bb.0: 850; VL_BW_DQ-NEXT: kmovd %edi, %k0 851; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0 852; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 853; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 854; VL_BW_DQ-NEXT: kmovd %k0, %eax 855; VL_BW_DQ-NEXT: # kill: def $ax killed $ax killed $eax 856; VL_BW_DQ-NEXT: vzeroupper 857; VL_BW_DQ-NEXT: retq 858 %b = bitcast i16 %a to <16 x i1> 859 %c = shufflevector < 16 x i1> 
%b, <16 x i1> undef, <16 x i32> zeroinitializer 860 %d = bitcast <16 x i1> %c to i16 861 ret i16 %d 862} 863 864define i64 @shuf64i1_zero(i64 %a) { 865; AVX512F-LABEL: shuf64i1_zero: 866; AVX512F: # %bb.0: 867; AVX512F-NEXT: kmovw %edi, %k1 868; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 869; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 870; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 871; AVX512F-NEXT: kmovw %k0, %eax 872; AVX512F-NEXT: kmovw %k0, %ecx 873; AVX512F-NEXT: shll $16, %ecx 874; AVX512F-NEXT: orl %eax, %ecx 875; AVX512F-NEXT: movq %rcx, %rax 876; AVX512F-NEXT: shlq $32, %rax 877; AVX512F-NEXT: orq %rcx, %rax 878; AVX512F-NEXT: vzeroupper 879; AVX512F-NEXT: retq 880; 881; AVX512VL-LABEL: shuf64i1_zero: 882; AVX512VL: # %bb.0: 883; AVX512VL-NEXT: kmovw %edi, %k1 884; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 885; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 886; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 887; AVX512VL-NEXT: kmovw %k0, %eax 888; AVX512VL-NEXT: kmovw %k0, %ecx 889; AVX512VL-NEXT: shll $16, %ecx 890; AVX512VL-NEXT: orl %eax, %ecx 891; AVX512VL-NEXT: movq %rcx, %rax 892; AVX512VL-NEXT: shlq $32, %rax 893; AVX512VL-NEXT: orq %rcx, %rax 894; AVX512VL-NEXT: vzeroupper 895; AVX512VL-NEXT: retq 896; 897; VL_BW_DQ-LABEL: shuf64i1_zero: 898; VL_BW_DQ: # %bb.0: 899; VL_BW_DQ-NEXT: kmovq %rdi, %k0 900; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0 901; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0 902; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0 903; VL_BW_DQ-NEXT: kmovq %k0, %rax 904; VL_BW_DQ-NEXT: vzeroupper 905; VL_BW_DQ-NEXT: retq 906 %b = bitcast i64 %a to <64 x i1> 907 %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer 908 %d = bitcast <64 x i1> %c to i64 909 ret i64 %d 910} 911