; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW

;
; Variable Shifts
;

define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <8 x i64> %a, %b
  ret <8 x i64> %shift
}

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <16 x i32> %a, %b
  ret <16 x i32> %shift
}

define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm3
; AVX512DQ-NEXT:    vpsravd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <32 x i16> %a, %b
  ret <32 x i16> %shift
}

define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
; AVX512DQ-NEXT:    vpsllw $5, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpsraw $4, %ymm5, %ymm6
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpsraw $2, %ymm5, %ymm6
; AVX512DQ-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpsraw $1, %ymm5, %ymm6
; AVX512DQ-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm6, %ymm5, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpsraw $4, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpsraw $2, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpsraw $1, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpaddw %ymm2, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm2
; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpsraw $4, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpsraw $2, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpsraw $1, %ymm4, %ymm5
; AVX512DQ-NEXT:    vpaddw %ymm3, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpsraw $4, %ymm0, %ymm4
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsraw $2, %ymm0, %ymm4
; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsraw $1, %ymm0, %ymm4
; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpsraw $4, %zmm2, %zmm3
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT:    vpsraw $2, %zmm2, %zmm3
; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT:    vpsraw $1, %zmm2, %zmm3
; AVX512BW-NEXT:    vpaddw %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT:    vpmovb2m %zmm4, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm2 {%k1}
; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpsraw $4, %zmm0, %zmm3
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsraw $2, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsraw $1, %zmm0, %zmm3
; AVX512BW-NEXT:    vpaddw %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <64 x i8> %a, %b
  ret <64 x i8> %shift
}

;
; Uniform Variable Shifts
;

define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
  %shift = ashr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
  %shift = ashr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpsraw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
  %shift = ashr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX512DQ-NEXT:    vpbroadcastb %xmm3, %ymm3
; AVX512DQ-NEXT:    vpand %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm4, %ymm4
; AVX512DQ-NEXT:    vpxor %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpxor %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm2, %zmm2
; AVX512BW-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpternlogq $108, %zmm0, %zmm2, %zmm1
; AVX512BW-NEXT:    vpsubb %zmm2, %zmm1, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
  %shift = ashr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Constant Shifts
;

define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
  ret <8 x i64> %shift
}

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift
}

define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm1
; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT:    vpsravd %zmm2, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpsravd %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
}

define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpsraw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpsraw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT:    vpsraw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT:    vpsraw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT:    vpsraw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT:    vpsraw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <64 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsraq $7, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %shift
}

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrad $5, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = ashr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift
}

define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsraw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsraw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsraw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift
}

define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpsrlw $3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512DQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQ-NEXT:    vpxor %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpxor %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpternlogq $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0
; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}

define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) {
; AVX512DQ-LABEL: ashr_const7_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: ashr_const7_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BW-NEXT:    retq
  %res = ashr <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <64 x i8> %res
}