1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOPAVX1 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOPAVX2 12 13declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) 14declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) 15declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>) 16declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>) 17 18; 19; Variable Shifts 20; 21 22define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { 23; AVX1-LABEL: var_funnnel_v4i64: 24; AVX1: # %bb.0: 25; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] 26; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 27; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 28; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 29; AVX1-NEXT: vpsrlq $1, %xmm6, %xmm6 30; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm7 31; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = 
xmm5[2,3,2,3] 32; AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 33; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5,6,7] 34; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 35; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm6 36; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] 37; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 38; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] 39; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 40; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 41; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 42; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 43; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm5 44; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 45; AVX1-NEXT: vpsllq %xmm3, %xmm4, %xmm3 46; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] 47; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm4 48; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 49; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 50; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] 51; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 52; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 53; AVX1-NEXT: retq 54; 55; AVX2-LABEL: var_funnnel_v4i64: 56; AVX2: # %bb.0: 57; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] 58; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4 59; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 60; AVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 61; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 62; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 63; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 64; AVX2-NEXT: retq 65; 66; AVX512F-LABEL: var_funnnel_v4i64: 67; AVX512F: # %bb.0: 68; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] 69; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4 70; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1 71; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 72; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 73; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 74; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 75; AVX512F-NEXT: retq 76; 77; AVX512VL-LABEL: var_funnnel_v4i64: 78; AVX512VL: # %bb.0: 79; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] 
80; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4 81; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1 82; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 83; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 84; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 85; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 86; AVX512VL-NEXT: retq 87; 88; AVX512BW-LABEL: var_funnnel_v4i64: 89; AVX512BW: # %bb.0: 90; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] 91; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4 92; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1 93; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 94; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 95; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 96; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 97; AVX512BW-NEXT: retq 98; 99; AVX512VBMI2-LABEL: var_funnnel_v4i64: 100; AVX512VBMI2: # %bb.0: 101; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 102; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 103; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 104; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 105; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 106; AVX512VBMI2-NEXT: retq 107; 108; AVX512VLBW-LABEL: var_funnnel_v4i64: 109; AVX512VLBW: # %bb.0: 110; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] 111; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4 112; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1 113; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 114; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2 115; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 116; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 117; AVX512VLBW-NEXT: retq 118; 119; AVX512VLVBMI2-LABEL: var_funnnel_v4i64: 120; AVX512VLVBMI2: # %bb.0: 121; AVX512VLVBMI2-NEXT: vpshldvq %ymm2, %ymm1, %ymm0 122; AVX512VLVBMI2-NEXT: retq 123; 124; XOPAVX1-LABEL: var_funnnel_v4i64: 125; XOPAVX1: # %bb.0: 126; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] 127; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 128; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 129; XOPAVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm6 130; XOPAVX1-NEXT: vpshlq %xmm5, %xmm6, %xmm5 131; XOPAVX1-NEXT: vpshlq %xmm4, %xmm0, %xmm0 132; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 133; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2 134; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 135; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 136; XOPAVX1-NEXT: vpsubq %xmm3, %xmm4, %xmm3 137; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 138; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 139; XOPAVX1-NEXT: vpshlq %xmm3, %xmm5, %xmm3 140; XOPAVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2 141; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 142; XOPAVX1-NEXT: vpshlq %xmm2, %xmm1, %xmm1 143; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 144; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 145; XOPAVX1-NEXT: retq 146; 147; XOPAVX2-LABEL: var_funnnel_v4i64: 148; XOPAVX2: # %bb.0: 149; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] 150; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4 151; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 152; XOPAVX2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 153; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 154; XOPAVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0 155; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 156; XOPAVX2-NEXT: retq 157 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) 158 ret <4 x i64> %res 159} 160 161define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { 162; AVX1-LABEL: var_funnnel_v8i32: 163; AVX1: # %bb.0: 164; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [31,31,31,31,31,31,31,31] 165; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm4 166; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 167; AVX1-NEXT: vpsrldq {{.*#+}} xmm6 = xmm5[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 168; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 169; AVX1-NEXT: vpsrld $1, %xmm7, %xmm7 170; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6 171; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm3 172; AVX1-NEXT: vpsrld %xmm3, %xmm7, %xmm3 173; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1,2,3],xmm6[4,5,6,7] 174; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 175; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] 176; AVX1-NEXT: vpsrld %xmm6, %xmm7, %xmm6 177; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero 178; AVX1-NEXT: vpsrld %xmm5, %xmm7, %xmm5 179; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] 180; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7] 181; AVX1-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 182; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 183; AVX1-NEXT: vpsrld %xmm5, %xmm1, %xmm5 184; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6 185; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 186; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] 187; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] 188; AVX1-NEXT: vpsrld %xmm6, %xmm1, %xmm6 189; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 190; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 191; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7] 192; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7] 193; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 194; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm2 195; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 196; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 197; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 198; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 199; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 200; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 201; AVX1-NEXT: vpmulld %xmm3, %xmm5, %xmm3 202; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 203; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 204; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 205; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 206; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 207; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 208; AVX1-NEXT: retq 209; 210; AVX2-LABEL: var_funnnel_v8i32: 211; AVX2: # %bb.0: 212; AVX2-NEXT: vpbroadcastd 
{{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] 213; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4 214; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 215; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 216; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 217; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 218; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 219; AVX2-NEXT: retq 220; 221; AVX512F-LABEL: var_funnnel_v8i32: 222; AVX512F: # %bb.0: 223; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] 224; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4 225; AVX512F-NEXT: vpsrld $1, %ymm1, %ymm1 226; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 227; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 228; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 229; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 230; AVX512F-NEXT: retq 231; 232; AVX512VL-LABEL: var_funnnel_v8i32: 233; AVX512VL: # %bb.0: 234; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] 235; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4 236; AVX512VL-NEXT: vpsrld $1, %ymm1, %ymm1 237; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 238; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 239; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 240; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 241; AVX512VL-NEXT: retq 242; 243; AVX512BW-LABEL: var_funnnel_v8i32: 244; AVX512BW: # %bb.0: 245; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] 246; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4 247; AVX512BW-NEXT: vpsrld $1, %ymm1, %ymm1 248; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 249; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 250; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 251; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 252; AVX512BW-NEXT: retq 253; 254; AVX512VBMI2-LABEL: var_funnnel_v8i32: 255; AVX512VBMI2: # %bb.0: 256; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 257; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 258; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 259; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 260; AVX512VBMI2-NEXT: # kill: 
def $ymm0 killed $ymm0 killed $zmm0 261; AVX512VBMI2-NEXT: retq 262; 263; AVX512VLBW-LABEL: var_funnnel_v8i32: 264; AVX512VLBW: # %bb.0: 265; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] 266; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4 267; AVX512VLBW-NEXT: vpsrld $1, %ymm1, %ymm1 268; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 269; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2 270; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 271; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 272; AVX512VLBW-NEXT: retq 273; 274; AVX512VLVBMI2-LABEL: var_funnnel_v8i32: 275; AVX512VLVBMI2: # %bb.0: 276; AVX512VLVBMI2-NEXT: vpshldvd %ymm2, %ymm1, %ymm0 277; AVX512VLVBMI2-NEXT: retq 278; 279; XOPAVX1-LABEL: var_funnnel_v8i32: 280; XOPAVX1: # %bb.0: 281; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] 282; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 283; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 284; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 285; XOPAVX1-NEXT: vpshld %xmm5, %xmm6, %xmm5 286; XOPAVX1-NEXT: vpshld %xmm4, %xmm0, %xmm0 287; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 288; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2 289; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 290; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 291; XOPAVX1-NEXT: vpsubd %xmm3, %xmm4, %xmm3 292; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 293; XOPAVX1-NEXT: vpsrld $1, %xmm5, %xmm5 294; XOPAVX1-NEXT: vpshld %xmm3, %xmm5, %xmm3 295; XOPAVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2 296; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1 297; XOPAVX1-NEXT: vpshld %xmm2, %xmm1, %xmm1 298; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 299; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 300; XOPAVX1-NEXT: retq 301; 302; XOPAVX2-LABEL: var_funnnel_v8i32: 303; XOPAVX2: # %bb.0: 304; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] 305; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4 306; XOPAVX2-NEXT: vpsrld $1, %ymm1, %ymm1 307; XOPAVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 308; 
XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 309; XOPAVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 310; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 311; XOPAVX2-NEXT: retq 312 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) 313 ret <8 x i32> %res 314} 315 316define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { 317; AVX1-LABEL: var_funnnel_v16i16: 318; AVX1: # %bb.0: 319; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 320; AVX1-NEXT: vandnps %ymm8, %ymm2, %ymm4 321; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 322; AVX1-NEXT: vpsllw $12, %xmm5, %xmm6 323; AVX1-NEXT: vpsllw $4, %xmm5, %xmm5 324; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5 325; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm6 326; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 327; AVX1-NEXT: vpsrlw $9, %xmm7, %xmm3 328; AVX1-NEXT: vpsrlw $1, %xmm7, %xmm7 329; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm7, %xmm3 330; AVX1-NEXT: vpsrlw $4, %xmm3, %xmm5 331; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3 332; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm5 333; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 334; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3 335; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm5 336; AVX1-NEXT: vpaddw %xmm6, %xmm6, %xmm6 337; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm3, %xmm3 338; AVX1-NEXT: vpsllw $12, %xmm4, %xmm5 339; AVX1-NEXT: vpsllw $4, %xmm4, %xmm4 340; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4 341; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm5 342; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm6 343; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 344; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 345; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 346; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 347; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4 348; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 349; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 350; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4 351; AVX1-NEXT: vpaddw %xmm5, %xmm5, %xmm5 352; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 353; AVX1-NEXT: 
vinsertf128 $1, %xmm3, %ymm1, %ymm1 354; AVX1-NEXT: vandps %ymm2, %ymm8, %ymm2 355; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 356; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7] 357; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 358; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] 359; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 360; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 361; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero 362; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 363; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3 364; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 365; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 366; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 367; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 368; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7] 369; AVX1-NEXT: vpslld $23, %xmm4, %xmm4 370; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 371; AVX1-NEXT: vcvttps2dq %xmm4, %xmm4 372; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 373; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 374; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 375; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 376; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 377; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 378; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 379; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 380; AVX1-NEXT: retq 381; 382; AVX2-LABEL: var_funnnel_v16i16: 383; AVX2: # %bb.0: 384; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 385; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4 386; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 387; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] 388; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 389; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm7 = 
ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15] 390; AVX2-NEXT: vpsrlvd %ymm6, %ymm7, %ymm6 391; AVX2-NEXT: vpsrld $16, %ymm6, %ymm6 392; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] 393; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] 394; AVX2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 395; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 396; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1 397; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15] 398; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 399; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] 400; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 401; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 402; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] 403; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] 404; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 405; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 406; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 407; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 408; AVX2-NEXT: retq 409; 410; AVX512F-LABEL: var_funnnel_v16i16: 411; AVX512F: # %bb.0: 412; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 413; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4 414; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero 415; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 416; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 417; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 418; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 419; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 420; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 421; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 422; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 423; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 424; AVX512F-NEXT: retq 425; 426; AVX512VL-LABEL: var_funnnel_v16i16: 427; AVX512VL: # %bb.0: 428; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 429; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4 430; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero 431; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 432; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 433; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 434; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 435; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 436; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 437; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 438; AVX512VL-NEXT: vpord %zmm1, %zmm0, %zmm0 439; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 440; AVX512VL-NEXT: retq 441; 442; AVX512BW-LABEL: var_funnnel_v16i16: 443; AVX512BW: # %bb.0: 444; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 445; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 446; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4 447; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 448; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 449; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 450; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 451; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 452; AVX512BW-NEXT: retq 453; 454; AVX512VBMI2-LABEL: var_funnnel_v16i16: 455; AVX512VBMI2: # %bb.0: 456; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 457; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 458; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 459; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 460; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 461; AVX512VBMI2-NEXT: retq 462; 463; AVX512VLBW-LABEL: var_funnnel_v16i16: 464; AVX512VLBW: # 
%bb.0: 465; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 466; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm4 467; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 468; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 469; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm2 470; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm0 471; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 472; AVX512VLBW-NEXT: retq 473; 474; AVX512VLVBMI2-LABEL: var_funnnel_v16i16: 475; AVX512VLVBMI2: # %bb.0: 476; AVX512VLVBMI2-NEXT: vpshldvw %ymm2, %ymm1, %ymm0 477; AVX512VLVBMI2-NEXT: retq 478; 479; XOPAVX1-LABEL: var_funnnel_v16i16: 480; XOPAVX1: # %bb.0: 481; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 482; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 483; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 484; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 485; XOPAVX1-NEXT: vpshlw %xmm5, %xmm6, %xmm5 486; XOPAVX1-NEXT: vpshlw %xmm4, %xmm0, %xmm0 487; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 488; XOPAVX1-NEXT: vandnps %ymm3, %ymm2, %ymm2 489; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 490; XOPAVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 491; XOPAVX1-NEXT: vpsubw %xmm3, %xmm4, %xmm3 492; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 493; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 494; XOPAVX1-NEXT: vpshlw %xmm3, %xmm5, %xmm3 495; XOPAVX1-NEXT: vpsubw %xmm2, %xmm4, %xmm2 496; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 497; XOPAVX1-NEXT: vpshlw %xmm2, %xmm1, %xmm1 498; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 499; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 500; XOPAVX1-NEXT: retq 501; 502; XOPAVX2-LABEL: var_funnnel_v16i16: 503; XOPAVX2: # %bb.0: 504; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 505; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 506; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 507; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 508; XOPAVX2-NEXT: vpshlw %xmm5, %xmm6, %xmm5 509; XOPAVX2-NEXT: vpshlw 
%xmm4, %xmm0, %xmm0 510; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 511; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 512; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 513; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 514; XOPAVX2-NEXT: vpsubw %xmm3, %xmm4, %xmm3 515; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 516; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 517; XOPAVX2-NEXT: vpshlw %xmm3, %xmm5, %xmm3 518; XOPAVX2-NEXT: vpsubw %xmm2, %xmm4, %xmm2 519; XOPAVX2-NEXT: vpshlw %xmm2, %xmm1, %xmm1 520; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 521; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 522; XOPAVX2-NEXT: retq 523 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) 524 ret <16 x i16> %res 525} 526 527define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind { 528; AVX1-LABEL: var_funnnel_v32i8: 529; AVX1: # %bb.0: 530; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 531; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 532; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 533; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm5 534; AVX1-NEXT: vpsrlw $4, %xmm5, %xmm3 535; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 536; AVX1-NEXT: vpand %xmm3, %xmm10, %xmm7 537; AVX1-NEXT: vmovaps {{.*#+}} ymm9 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 538; AVX1-NEXT: vandnps %ymm9, %ymm2, %ymm8 539; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm3 540; AVX1-NEXT: vpsllw $5, %xmm3, %xmm3 541; AVX1-NEXT: vpblendvb %xmm3, %xmm7, %xmm5, %xmm5 542; AVX1-NEXT: vpsrlw $2, %xmm5, %xmm7 543; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] 544; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm7 545; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 546; AVX1-NEXT: vpblendvb %xmm3, %xmm7, %xmm5, %xmm5 547; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm7 548; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 549; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 550; AVX1-NEXT: 
vpblendvb %xmm3, %xmm7, %xmm5, %xmm3 551; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 552; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 553; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm5 554; AVX1-NEXT: vpand %xmm5, %xmm10, %xmm5 555; AVX1-NEXT: vpsllw $5, %xmm8, %xmm7 556; AVX1-NEXT: vpblendvb %xmm7, %xmm5, %xmm1, %xmm1 557; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm5 558; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 559; AVX1-NEXT: vpaddb %xmm7, %xmm7, %xmm6 560; AVX1-NEXT: vpblendvb %xmm6, %xmm5, %xmm1, %xmm1 561; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm5 562; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 563; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm5 564; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 565; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 566; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 567; AVX1-NEXT: vpsllw $4, %xmm3, %xmm4 568; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 569; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 570; AVX1-NEXT: vandps %ymm2, %ymm9, %ymm2 571; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 572; AVX1-NEXT: vpsllw $5, %xmm6, %xmm6 573; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3 574; AVX1-NEXT: vpsllw $2, %xmm3, %xmm4 575; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 576; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 577; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 578; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3 579; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm4 580; AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm6 581; AVX1-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3 582; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 583; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 584; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 585; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 586; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4 587; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 588; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 589; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 590; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4 591; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm2 592; AVX1-NEXT: 
vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 593; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 594; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 595; AVX1-NEXT: retq 596; 597; AVX2-LABEL: var_funnnel_v32i8: 598; AVX2: # %bb.0: 599; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 600; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm4 601; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4 602; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 603; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 604; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1 605; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm6 606; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 607; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 608; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm6 609; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 610; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4 611; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 612; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm6 613; AVX2-NEXT: vpand %ymm5, %ymm6, %ymm5 614; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm4 615; AVX2-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1 616; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 617; AVX2-NEXT: vpsllw $5, %ymm2, %ymm2 618; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm3 619; AVX2-NEXT: vpsllw $4, %ymm0, %ymm4 620; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 621; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 622; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 623; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 624; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 625; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 626; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3 627; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 628; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 629; AVX2-NEXT: retq 630; 631; AVX512F-LABEL: var_funnnel_v32i8: 632; AVX512F: # %bb.0: 633; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 634; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4 635; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 636; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 637; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 638; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 639; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6 640; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 641; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 642; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm6 643; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 644; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 645; AVX512F-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 646; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm6 647; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm5 648; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm4 649; AVX512F-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1 650; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 651; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 652; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3 653; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 654; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 655; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 656; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 657; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 658; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 659; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 660; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 661; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 662; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 663; AVX512F-NEXT: retq 664; 665; AVX512VL-LABEL: var_funnnel_v32i8: 666; AVX512VL: # %bb.0: 667; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 668; AVX512VL-NEXT: vpandn %ymm3, %ymm2, %ymm4 669; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm4 670; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 671; 
AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 672; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 673; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm6 674; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 675; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 676; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm6 677; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 678; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 679; AVX512VL-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm1 680; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm6 681; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm5 682; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm4 683; AVX512VL-NEXT: vpblendvb %ymm4, %ymm5, %ymm1, %ymm1 684; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 685; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 686; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 687; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 688; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 689; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 690; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 691; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 692; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 693; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 694; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 695; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 696; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 697; AVX512VL-NEXT: retq 698; 699; AVX512BW-LABEL: var_funnnel_v32i8: 700; AVX512BW: # %bb.0: 701; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 702; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 703; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero 704; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 705; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 706; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 707; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero 708; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 709; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 710; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 711; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 712; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 713; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 714; AVX512BW-NEXT: retq 715; 716; AVX512VBMI2-LABEL: var_funnnel_v32i8: 717; AVX512VBMI2: # %bb.0: 718; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 719; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 720; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero 721; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 722; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 723; AVX512VBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2 724; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero 725; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1 726; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 727; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 728; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 729; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 730; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 731; AVX512VBMI2-NEXT: retq 732; 733; AVX512VLBW-LABEL: var_funnnel_v32i8: 734; AVX512VLBW: # %bb.0: 735; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 736; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 737; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero 738; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 739; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 740; AVX512VLBW-NEXT: vpandn %ymm3, %ymm2, %ymm2 741; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero 742; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 743; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 744; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 745; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 746; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 747; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 748; AVX512VLBW-NEXT: retq 749; 750; AVX512VLVBMI2-LABEL: var_funnnel_v32i8: 751; AVX512VLVBMI2: # %bb.0: 752; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 753; 
AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 754; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero 755; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 756; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 757; AVX512VLVBMI2-NEXT: vpandn %ymm3, %ymm2, %ymm2 758; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero 759; AVX512VLVBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1 760; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 761; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 762; AVX512VLVBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 763; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 764; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 765; AVX512VLVBMI2-NEXT: retq 766; 767; XOPAVX1-LABEL: var_funnnel_v32i8: 768; XOPAVX1: # %bb.0: 769; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 770; XOPAVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 771; XOPAVX1-NEXT: vpshlb %xmm4, %xmm3, %xmm3 772; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 773; XOPAVX1-NEXT: vandnps %ymm8, %ymm2, %ymm6 774; XOPAVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 775; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 776; XOPAVX1-NEXT: vpsubb %xmm7, %xmm5, %xmm7 777; XOPAVX1-NEXT: vpshlb %xmm7, %xmm3, %xmm3 778; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1 779; XOPAVX1-NEXT: vpsubb %xmm6, %xmm5, %xmm4 780; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1 781; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 782; XOPAVX1-NEXT: vandps %ymm2, %ymm8, %ymm2 783; XOPAVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 784; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 785; XOPAVX1-NEXT: vpshlb %xmm3, %xmm4, %xmm3 786; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 787; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 788; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 789; XOPAVX1-NEXT: retq 790; 791; XOPAVX2-LABEL: var_funnnel_v32i8: 792; XOPAVX2: # %bb.0: 793; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 794; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 795; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 796; 
XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 797; XOPAVX2-NEXT: vpshlb %xmm5, %xmm6, %xmm5 798; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0 799; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 800; XOPAVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2 801; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 802; XOPAVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 803; XOPAVX2-NEXT: vpsubb %xmm3, %xmm4, %xmm3 804; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 805; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 806; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 807; XOPAVX2-NEXT: vpshlb %xmm3, %xmm5, %xmm3 808; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm2 809; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1 810; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 811; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 812; XOPAVX2-NEXT: retq 813 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) 814 ret <32 x i8> %res 815} 816 817; 818; Uniform Variable Shifts 819; 820 821define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { 822; AVX1-LABEL: splatvar_funnnel_v4i64: 823; AVX1: # %bb.0: 824; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] 825; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 826; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 827; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 828; AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 829; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 830; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 831; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 832; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 833; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 834; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 835; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 836; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 837; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 838; AVX1-NEXT: retq 839; 840; AVX2-LABEL: splatvar_funnnel_v4i64: 841; AVX2: # %bb.0: 842; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] 843; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 844; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 845; AVX2-NEXT: vpsrlq %xmm4, %ymm1, 
%ymm1 846; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 847; AVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 848; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 849; AVX2-NEXT: retq 850; 851; AVX512F-LABEL: splatvar_funnnel_v4i64: 852; AVX512F: # %bb.0: 853; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] 854; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 855; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1 856; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 857; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 858; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm0 859; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 860; AVX512F-NEXT: retq 861; 862; AVX512VL-LABEL: splatvar_funnnel_v4i64: 863; AVX512VL: # %bb.0: 864; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] 865; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 866; AVX512VL-NEXT: vpsrlq $1, %ymm1, %ymm1 867; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 868; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 869; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm0 870; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 871; AVX512VL-NEXT: retq 872; 873; AVX512BW-LABEL: splatvar_funnnel_v4i64: 874; AVX512BW: # %bb.0: 875; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] 876; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 877; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1 878; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 879; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 880; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 881; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 882; AVX512BW-NEXT: retq 883; 884; AVX512VBMI2-LABEL: splatvar_funnnel_v4i64: 885; AVX512VBMI2: # %bb.0: 886; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 887; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 888; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 889; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 890; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 891; AVX512VBMI2-NEXT: retq 892; 893; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: 894; AVX512VLBW: # %bb.0: 895; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] 896; AVX512VLBW-NEXT: vpandn %xmm3, 
%xmm2, %xmm4 897; AVX512VLBW-NEXT: vpsrlq $1, %ymm1, %ymm1 898; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 899; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 900; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm0 901; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 902; AVX512VLBW-NEXT: retq 903; 904; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i64: 905; AVX512VLVBMI2: # %bb.0: 906; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 907; AVX512VLVBMI2-NEXT: vpshldvq %ymm2, %ymm1, %ymm0 908; AVX512VLVBMI2-NEXT: retq 909; 910; XOPAVX1-LABEL: splatvar_funnnel_v4i64: 911; XOPAVX1: # %bb.0: 912; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] 913; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 914; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 915; XOPAVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 916; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm5 917; XOPAVX1-NEXT: vpsrlq $1, %xmm1, %xmm1 918; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 919; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 920; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 921; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 922; XOPAVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm3 923; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 924; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 925; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 926; XOPAVX1-NEXT: retq 927; 928; XOPAVX2-LABEL: splatvar_funnnel_v4i64: 929; XOPAVX2: # %bb.0: 930; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] 931; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 932; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 933; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 934; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 935; XOPAVX2-NEXT: vpsllq %xmm2, %ymm0, %ymm0 936; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 937; XOPAVX2-NEXT: retq 938 %splat = shufflevector <4 x i64> %amt, <4 x i64> undef, <4 x i32> zeroinitializer 939 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %splat) 940 ret <4 x i64> %res 941} 942 943define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) nounwind { 944; AVX1-LABEL: 
splatvar_funnnel_v8i32: 945; AVX1: # %bb.0: 946; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] 947; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 948; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 949; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 950; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 951; AVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5 952; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 953; AVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 954; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 955; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 956; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 957; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 958; AVX1-NEXT: vpslld %xmm2, %xmm3, %xmm3 959; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0 960; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 961; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 962; AVX1-NEXT: retq 963; 964; AVX2-LABEL: splatvar_funnnel_v8i32: 965; AVX2: # %bb.0: 966; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 967; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 968; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 969; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 970; AVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 971; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 972; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 973; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm0 974; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 975; AVX2-NEXT: retq 976; 977; AVX512F-LABEL: splatvar_funnnel_v8i32: 978; AVX512F: # %bb.0: 979; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 980; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 981; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 982; AVX512F-NEXT: vpsrld $1, %ymm1, %ymm1 983; AVX512F-NEXT: vpsrld %xmm4, %ymm1, %ymm1 984; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 985; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 986; AVX512F-NEXT: vpslld %xmm2, %ymm0, %ymm0 987; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 988; AVX512F-NEXT: retq 989; 990; AVX512VL-LABEL: splatvar_funnnel_v8i32: 991; AVX512VL: 
# %bb.0: 992; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 993; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 994; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 995; AVX512VL-NEXT: vpsrld $1, %ymm1, %ymm1 996; AVX512VL-NEXT: vpsrld %xmm4, %ymm1, %ymm1 997; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 998; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 999; AVX512VL-NEXT: vpslld %xmm2, %ymm0, %ymm0 1000; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 1001; AVX512VL-NEXT: retq 1002; 1003; AVX512BW-LABEL: splatvar_funnnel_v8i32: 1004; AVX512BW: # %bb.0: 1005; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 1006; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1007; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 1008; AVX512BW-NEXT: vpsrld $1, %ymm1, %ymm1 1009; AVX512BW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 1010; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 1011; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 1012; AVX512BW-NEXT: vpslld %xmm2, %ymm0, %ymm0 1013; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 1014; AVX512BW-NEXT: retq 1015; 1016; AVX512VBMI2-LABEL: splatvar_funnnel_v8i32: 1017; AVX512VBMI2: # %bb.0: 1018; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1019; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1020; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 1021; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 1022; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1023; AVX512VBMI2-NEXT: retq 1024; 1025; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: 1026; AVX512VLBW: # %bb.0: 1027; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 1028; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1029; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 1030; AVX512VLBW-NEXT: vpsrld $1, %ymm1, %ymm1 1031; AVX512VLBW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 1032; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 1033; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm2 = 
xmm2[0],zero,xmm2[1],zero 1034; AVX512VLBW-NEXT: vpslld %xmm2, %ymm0, %ymm0 1035; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 1036; AVX512VLBW-NEXT: retq 1037; 1038; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i32: 1039; AVX512VLVBMI2: # %bb.0: 1040; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 1041; AVX512VLVBMI2-NEXT: vpshldvd %ymm2, %ymm1, %ymm0 1042; AVX512VLVBMI2-NEXT: retq 1043; 1044; XOPAVX1-LABEL: splatvar_funnnel_v8i32: 1045; XOPAVX1: # %bb.0: 1046; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] 1047; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 1048; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 1049; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 1050; XOPAVX1-NEXT: vpsrld $1, %xmm5, %xmm5 1051; XOPAVX1-NEXT: vpsrld %xmm4, %xmm5, %xmm5 1052; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1 1053; XOPAVX1-NEXT: vpsrld %xmm4, %xmm1, %xmm1 1054; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 1055; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1056; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 1057; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1058; XOPAVX1-NEXT: vpslld %xmm2, %xmm3, %xmm3 1059; XOPAVX1-NEXT: vpslld %xmm2, %xmm0, %xmm0 1060; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1061; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1062; XOPAVX1-NEXT: retq 1063; 1064; XOPAVX2-LABEL: splatvar_funnnel_v8i32: 1065; XOPAVX2: # %bb.0: 1066; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] 1067; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 1068; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 1069; XOPAVX2-NEXT: vpsrld $1, %ymm1, %ymm1 1070; XOPAVX2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 1071; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 1072; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero 1073; XOPAVX2-NEXT: vpslld %xmm2, %ymm0, %ymm0 1074; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1075; XOPAVX2-NEXT: retq 1076 %splat = shufflevector <8 x i32> %amt, <8 x i32> undef, <8 x i32> zeroinitializer 1077 %res = call <8 x 
i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %splat) 1078 ret <8 x i32> %res 1079} 1080 1081define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { 1082; AVX1-LABEL: splatvar_funnnel_v16i16: 1083; AVX1: # %bb.0: 1084; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 1085; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 1086; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 1087; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 1088; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 1089; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 1090; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 1091; AVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1092; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 1093; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1094; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1095; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1096; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1097; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1098; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1099; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1100; AVX1-NEXT: retq 1101; 1102; AVX2-LABEL: splatvar_funnnel_v16i16: 1103; AVX2: # %bb.0: 1104; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 1105; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 1106; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 1107; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 1108; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 1109; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 1110; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1111; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 1112; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1113; AVX2-NEXT: retq 1114; 1115; AVX512F-LABEL: splatvar_funnnel_v16i16: 1116; AVX512F: # %bb.0: 1117; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 1118; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 1119; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm4 = 
xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 1120; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 1121; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 1122; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 1123; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1124; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm0 1125; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 1126; AVX512F-NEXT: retq 1127; 1128; AVX512VL-LABEL: splatvar_funnnel_v16i16: 1129; AVX512VL: # %bb.0: 1130; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 1131; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 1132; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 1133; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 1134; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 1135; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm2 1136; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1137; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm0 1138; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 1139; AVX512VL-NEXT: retq 1140; 1141; AVX512BW-LABEL: splatvar_funnnel_v16i16: 1142; AVX512BW: # %bb.0: 1143; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 1144; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1145; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 1146; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 1147; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 1148; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 1149; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1150; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 1151; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 1152; AVX512BW-NEXT: retq 1153; 1154; AVX512VBMI2-LABEL: splatvar_funnnel_v16i16: 1155; AVX512VBMI2: # %bb.0: 1156; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1157; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1158; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 1159; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 
1160; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1161; AVX512VBMI2-NEXT: retq 1162; 1163; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: 1164; AVX512VLBW: # %bb.0: 1165; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 1166; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1167; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 1168; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 1169; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 1170; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 1171; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1172; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0 1173; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 1174; AVX512VLBW-NEXT: retq 1175; 1176; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i16: 1177; AVX512VLVBMI2: # %bb.0: 1178; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 1179; AVX512VLVBMI2-NEXT: vpshldvw %ymm2, %ymm1, %ymm0 1180; AVX512VLVBMI2-NEXT: retq 1181; 1182; XOPAVX1-LABEL: splatvar_funnnel_v16i16: 1183; XOPAVX1: # %bb.0: 1184; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 1185; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 1186; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 1187; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 1188; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 1189; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 1190; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 1191; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 1192; XOPAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 1193; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1194; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1195; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1196; XOPAVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1197; XOPAVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1198; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1199; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1200; XOPAVX1-NEXT: retq 1201; 1202; XOPAVX2-LABEL: 
splatvar_funnnel_v16i16: 1203; XOPAVX2: # %bb.0: 1204; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 1205; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 1206; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 1207; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 1208; XOPAVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 1209; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 1210; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1211; XOPAVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm0 1212; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1213; XOPAVX2-NEXT: retq 1214 %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer 1215 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %splat) 1216 ret <16 x i16> %res 1217} 1218 1219define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind { 1220; AVX1-LABEL: splatvar_funnnel_v32i8: 1221; AVX1: # %bb.0: 1222; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1223; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 1224; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1225; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 1226; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1227; AVX1-NEXT: vpandn %xmm8, %xmm2, %xmm6 1228; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero 1229; AVX1-NEXT: vpsrlw %xmm6, %xmm3, %xmm3 1230; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7 1231; AVX1-NEXT: vpsrlw %xmm6, %xmm7, %xmm5 1232; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1233; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 1234; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 1235; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 1236; AVX1-NEXT: vpsrlw %xmm6, %xmm1, %xmm1 1237; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 1238; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1239; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 
1240; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1241; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1242; AVX1-NEXT: vpsllw %xmm2, %xmm3, %xmm3 1243; AVX1-NEXT: vpsllw %xmm2, %xmm7, %xmm4 1244; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 1245; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 1246; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 1247; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm0 1248; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 1249; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1250; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1251; AVX1-NEXT: retq 1252; 1253; AVX2-LABEL: splatvar_funnnel_v32i8: 1254; AVX2: # %bb.0: 1255; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1256; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 1257; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 1258; AVX2-NEXT: vpsllw %xmm4, %ymm0, %ymm0 1259; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 1260; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4 1261; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4 1262; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 1263; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 1264; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1265; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 1266; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1267; AVX2-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 1268; AVX2-NEXT: vpsrlw %xmm2, %xmm5, %xmm2 1269; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 1270; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 1271; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1272; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1273; AVX2-NEXT: retq 1274; 1275; AVX512F-LABEL: splatvar_funnnel_v32i8: 1276; AVX512F: # %bb.0: 1277; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1278; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 1279; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = 
xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 1280; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0 1281; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 1282; AVX512F-NEXT: vpsllw %xmm4, %xmm5, %xmm4 1283; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 1284; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 1285; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 1286; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1287; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 1288; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1289; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 1290; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm2 1291; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2 1292; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 1293; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 1294; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 1295; AVX512F-NEXT: retq 1296; 1297; AVX512VL-LABEL: splatvar_funnnel_v32i8: 1298; AVX512VL: # %bb.0: 1299; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1300; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 1301; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 1302; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0 1303; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 1304; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4 1305; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 1306; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm4 1307; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm0 1308; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero 1309; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 1310; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1311; AVX512VL-NEXT: vpsrlw %xmm0, %ymm1, %ymm1 1312; AVX512VL-NEXT: vpsrlw %xmm0, %xmm5, %xmm0 1313; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 1314; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0 1315; AVX512VL-NEXT: 
vpternlogq $236, %ymm1, %ymm4, %ymm0 1316; AVX512VL-NEXT: retq 1317; 1318; AVX512BW-LABEL: splatvar_funnnel_v32i8: 1319; AVX512BW: # %bb.0: 1320; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1321; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1322; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 1323; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 1324; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1325; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 1326; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 1327; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 1328; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1329; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1330; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 1331; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 1332; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1333; AVX512BW-NEXT: retq 1334; 1335; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8: 1336; AVX512VBMI2: # %bb.0: 1337; AVX512VBMI2-NEXT: vmovdqa 
{{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1338; AVX512VBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm4 1339; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 1340; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1 1341; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1342; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 1343; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 1344; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 1345; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1346; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1347; AVX512VBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm0 1348; AVX512VBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 1349; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 1350; AVX512VBMI2-NEXT: retq 1351; 1352; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: 1353; AVX512VLBW: # %bb.0: 1354; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1355; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 1356; AVX512VLBW-NEXT: vpmovzxbq 
{{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 1357; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 1358; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1359; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 1360; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 1361; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 1362; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1363; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1364; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm0 1365; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 1366; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 1367; AVX512VLBW-NEXT: retq 1368; 1369; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: 1370; AVX512VLVBMI2: # %bb.0: 1371; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1372; AVX512VLVBMI2-NEXT: vpandn %xmm3, %xmm2, %xmm4 1373; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero 1374; AVX512VLVBMI2-NEXT: vpsrlw 
$1, %ymm1, %ymm1 1375; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1376; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 1377; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 1378; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 1379; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1380; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1381; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %zmm0, %zmm0 1382; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 1383; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 1384; AVX512VLVBMI2-NEXT: retq 1385; 1386; XOPAVX1-LABEL: splatvar_funnnel_v32i8: 1387; XOPAVX1: # %bb.0: 1388; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 1389; XOPAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 1390; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1391; XOPAVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 1392; XOPAVX1-NEXT: vpshlb %xmm5, %xmm4, %xmm4 1393; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1394; XOPAVX1-NEXT: vpandn %xmm6, %xmm2, %xmm7 1395; XOPAVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3 1396; 
XOPAVX1-NEXT: vpshlb %xmm3, %xmm4, %xmm4 1397; XOPAVX1-NEXT: vpshlb %xmm5, %xmm1, %xmm1 1398; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 1399; XOPAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 1400; XOPAVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 1401; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1402; XOPAVX1-NEXT: vpshlb %xmm2, %xmm3, %xmm3 1403; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 1404; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 1405; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1406; XOPAVX1-NEXT: retq 1407; 1408; XOPAVX2-LABEL: splatvar_funnnel_v32i8: 1409; XOPAVX2: # %bb.0: 1410; XOPAVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1411; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] 1412; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 1413; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 1414; XOPAVX2-NEXT: vpshlb %xmm4, %xmm5, %xmm5 1415; XOPAVX2-NEXT: vpshlb %xmm4, %xmm0, %xmm0 1416; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 1417; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 1418; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 1419; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2 1420; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 1421; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1422; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 1423; XOPAVX2-NEXT: vpshlb %xmm2, %xmm3, %xmm3 1424; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1 1425; XOPAVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 1426; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1427; XOPAVX2-NEXT: retq 1428 %splat = shufflevector <32 x i8> %amt, <32 x i8> undef, <32 x i32> zeroinitializer 1429 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %splat) 1430 ret <32 x i8> %res 1431} 1432 1433; Harder PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426 1434; CGP should sink splatted select operands through the funnel shift. 
1435 1436define void @fancierRotate2(i32* %arr, i8* %control, i32 %rot0, i32 %rot1) { 1437; AVX1-LABEL: fancierRotate2: 1438; AVX1: # %bb.0: # %entry 1439; AVX1-NEXT: vmovd %edx, %xmm1 1440; AVX1-NEXT: vmovd %ecx, %xmm3 1441; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 1442; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 1443; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31] 1444; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm2 1445; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero 1446; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] 1447; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2 1448; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero 1449; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm4 1450; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm4[0],zero,xmm4[1],zero 1451; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4 1452; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero 1453; AVX1-NEXT: .p2align 4, 0x90 1454; AVX1-NEXT: .LBB8_1: # %loop 1455; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 1456; AVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero 1457; AVX1-NEXT: vpcmpeqb %xmm5, %xmm8, %xmm5 1458; AVX1-NEXT: vpmovsxbd %xmm5, %xmm6 1459; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] 1460; AVX1-NEXT: vpmovsxbd %xmm5, %xmm5 1461; AVX1-NEXT: vmovdqu 4096(%rdi,%rax,4), %xmm7 1462; AVX1-NEXT: vmovdqu 4112(%rdi,%rax,4), %xmm0 1463; AVX1-NEXT: vpslld %xmm9, %xmm7, %xmm1 1464; AVX1-NEXT: vpsrld %xmm10, %xmm7, %xmm2 1465; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 1466; AVX1-NEXT: vpslld %xmm9, %xmm0, %xmm2 1467; AVX1-NEXT: vpsrld %xmm10, %xmm0, %xmm3 1468; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 1469; AVX1-NEXT: vpslld %xmm11, %xmm7, %xmm3 1470; AVX1-NEXT: vpsrld %xmm4, %xmm7, %xmm7 1471; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm3 1472; AVX1-NEXT: vblendvps %xmm6, %xmm1, %xmm3, %xmm1 1473; AVX1-NEXT: vpslld %xmm11, %xmm0, %xmm3 1474; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm0 1475; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 1476; AVX1-NEXT: vblendvps %xmm5, %xmm2, %xmm0, %xmm0 1477; AVX1-NEXT: vmovups %xmm1, 
4096(%rdi,%rax,4) 1478; AVX1-NEXT: vmovups %xmm0, 4112(%rdi,%rax,4) 1479; AVX1-NEXT: addq $8, %rax 1480; AVX1-NEXT: jne .LBB8_1 1481; AVX1-NEXT: # %bb.2: # %exit 1482; AVX1-NEXT: vzeroupper 1483; AVX1-NEXT: retq 1484; 1485; AVX2-LABEL: fancierRotate2: 1486; AVX2: # %bb.0: # %entry 1487; AVX2-NEXT: vmovd %edx, %xmm0 1488; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 1489; AVX2-NEXT: vmovd %ecx, %xmm1 1490; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 1491; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 1492; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1493; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] 1494; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] 1495; AVX2-NEXT: .p2align 4, 0x90 1496; AVX2-NEXT: .LBB8_1: # %loop 1497; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 1498; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 1499; AVX2-NEXT: vpcmpeqd %ymm2, %ymm5, %ymm5 1500; AVX2-NEXT: vblendvps %ymm5, %ymm0, %ymm1, %ymm5 1501; AVX2-NEXT: vandps %ymm3, %ymm5, %ymm5 1502; AVX2-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm6 1503; AVX2-NEXT: vpsllvd %ymm5, %ymm6, %ymm7 1504; AVX2-NEXT: vpsubd %ymm5, %ymm4, %ymm5 1505; AVX2-NEXT: vpsrlvd %ymm5, %ymm6, %ymm5 1506; AVX2-NEXT: vpor %ymm5, %ymm7, %ymm5 1507; AVX2-NEXT: vmovdqu %ymm5, 4096(%rdi,%rax,4) 1508; AVX2-NEXT: addq $8, %rax 1509; AVX2-NEXT: jne .LBB8_1 1510; AVX2-NEXT: # %bb.2: # %exit 1511; AVX2-NEXT: vzeroupper 1512; AVX2-NEXT: retq 1513; 1514; AVX512F-LABEL: fancierRotate2: 1515; AVX512F: # %bb.0: # %entry 1516; AVX512F-NEXT: vmovd %edx, %xmm0 1517; AVX512F-NEXT: vpbroadcastd %xmm0, %ymm0 1518; AVX512F-NEXT: vmovd %ecx, %xmm1 1519; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1 1520; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 1521; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 1522; AVX512F-NEXT: .p2align 4, 0x90 1523; AVX512F-NEXT: .LBB8_1: # %loop 
1524; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 1525; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 1526; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 1527; AVX512F-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 1528; AVX512F-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4 1529; AVX512F-NEXT: vprolvd %zmm3, %zmm4, %zmm3 1530; AVX512F-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) 1531; AVX512F-NEXT: addq $8, %rax 1532; AVX512F-NEXT: jne .LBB8_1 1533; AVX512F-NEXT: # %bb.2: # %exit 1534; AVX512F-NEXT: vzeroupper 1535; AVX512F-NEXT: retq 1536; 1537; AVX512VL-LABEL: fancierRotate2: 1538; AVX512VL: # %bb.0: # %entry 1539; AVX512VL-NEXT: vpbroadcastd %edx, %ymm0 1540; AVX512VL-NEXT: vpbroadcastd %ecx, %ymm1 1541; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00 1542; AVX512VL-NEXT: .p2align 4, 0x90 1543; AVX512VL-NEXT: .LBB8_1: # %loop 1544; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 1545; AVX512VL-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 1546; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 1547; AVX512VL-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1} 1548; AVX512VL-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3 1549; AVX512VL-NEXT: vprolvd %ymm2, %ymm3, %ymm2 1550; AVX512VL-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4) 1551; AVX512VL-NEXT: addq $8, %rax 1552; AVX512VL-NEXT: jne .LBB8_1 1553; AVX512VL-NEXT: # %bb.2: # %exit 1554; AVX512VL-NEXT: vzeroupper 1555; AVX512VL-NEXT: retq 1556; 1557; AVX512BW-LABEL: fancierRotate2: 1558; AVX512BW: # %bb.0: # %entry 1559; AVX512BW-NEXT: vmovd %edx, %xmm0 1560; AVX512BW-NEXT: vpbroadcastd %xmm0, %ymm0 1561; AVX512BW-NEXT: vmovd %ecx, %xmm1 1562; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1 1563; AVX512BW-NEXT: movq $-1024, %rax # imm 
= 0xFC00 1564; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 1565; AVX512BW-NEXT: .p2align 4, 0x90 1566; AVX512BW-NEXT: .LBB8_1: # %loop 1567; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1 1568; AVX512BW-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 1569; AVX512BW-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 1570; AVX512BW-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 1571; AVX512BW-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4 1572; AVX512BW-NEXT: vprolvd %zmm3, %zmm4, %zmm3 1573; AVX512BW-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) 1574; AVX512BW-NEXT: addq $8, %rax 1575; AVX512BW-NEXT: jne .LBB8_1 1576; AVX512BW-NEXT: # %bb.2: # %exit 1577; AVX512BW-NEXT: vzeroupper 1578; AVX512BW-NEXT: retq 1579; 1580; AVX512VBMI2-LABEL: fancierRotate2: 1581; AVX512VBMI2: # %bb.0: # %entry 1582; AVX512VBMI2-NEXT: vmovd %edx, %xmm0 1583; AVX512VBMI2-NEXT: vpbroadcastd %xmm0, %ymm0 1584; AVX512VBMI2-NEXT: vmovd %ecx, %xmm1 1585; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %ymm1 1586; AVX512VBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00 1587; AVX512VBMI2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1588; AVX512VBMI2-NEXT: .p2align 4, 0x90 1589; AVX512VBMI2-NEXT: .LBB8_1: # %loop 1590; AVX512VBMI2-NEXT: # =>This Inner Loop Header: Depth=1 1591; AVX512VBMI2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 1592; AVX512VBMI2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 1593; AVX512VBMI2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 1594; AVX512VBMI2-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm4 1595; AVX512VBMI2-NEXT: vprolvd %zmm3, %zmm4, %zmm3 1596; AVX512VBMI2-NEXT: vmovdqu %ymm3, 4096(%rdi,%rax,4) 1597; AVX512VBMI2-NEXT: addq $8, %rax 1598; AVX512VBMI2-NEXT: jne .LBB8_1 1599; AVX512VBMI2-NEXT: # %bb.2: # %exit 1600; 
AVX512VBMI2-NEXT: vzeroupper 1601; AVX512VBMI2-NEXT: retq 1602; 1603; AVX512VLBW-LABEL: fancierRotate2: 1604; AVX512VLBW: # %bb.0: # %entry 1605; AVX512VLBW-NEXT: vpbroadcastd %edx, %ymm0 1606; AVX512VLBW-NEXT: vpbroadcastd %ecx, %ymm1 1607; AVX512VLBW-NEXT: movq $-1024, %rax # imm = 0xFC00 1608; AVX512VLBW-NEXT: .p2align 4, 0x90 1609; AVX512VLBW-NEXT: .LBB8_1: # %loop 1610; AVX512VLBW-NEXT: # =>This Inner Loop Header: Depth=1 1611; AVX512VLBW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1612; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 1613; AVX512VLBW-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1} 1614; AVX512VLBW-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3 1615; AVX512VLBW-NEXT: vprolvd %ymm2, %ymm3, %ymm2 1616; AVX512VLBW-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4) 1617; AVX512VLBW-NEXT: addq $8, %rax 1618; AVX512VLBW-NEXT: jne .LBB8_1 1619; AVX512VLBW-NEXT: # %bb.2: # %exit 1620; AVX512VLBW-NEXT: vzeroupper 1621; AVX512VLBW-NEXT: retq 1622; 1623; AVX512VLVBMI2-LABEL: fancierRotate2: 1624; AVX512VLVBMI2: # %bb.0: # %entry 1625; AVX512VLVBMI2-NEXT: vpbroadcastd %edx, %ymm0 1626; AVX512VLVBMI2-NEXT: vpbroadcastd %ecx, %ymm1 1627; AVX512VLVBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00 1628; AVX512VLVBMI2-NEXT: .p2align 4, 0x90 1629; AVX512VLVBMI2-NEXT: .LBB8_1: # %loop 1630; AVX512VLVBMI2-NEXT: # =>This Inner Loop Header: Depth=1 1631; AVX512VLVBMI2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 1632; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1 1633; AVX512VLVBMI2-NEXT: vpblendmd %ymm0, %ymm1, %ymm2 {%k1} 1634; AVX512VLVBMI2-NEXT: vmovdqu 4096(%rdi,%rax,4), %ymm3 1635; AVX512VLVBMI2-NEXT: vprolvd %ymm2, %ymm3, %ymm2 1636; AVX512VLVBMI2-NEXT: vmovdqu %ymm2, 4096(%rdi,%rax,4) 1637; AVX512VLVBMI2-NEXT: addq $8, %rax 1638; AVX512VLVBMI2-NEXT: jne .LBB8_1 1639; AVX512VLVBMI2-NEXT: # %bb.2: # %exit 1640; AVX512VLVBMI2-NEXT: vzeroupper 1641; AVX512VLVBMI2-NEXT: retq 1642; 1643; XOPAVX1-LABEL: fancierRotate2: 1644; XOPAVX1: # %bb.0: # %entry 1645; XOPAVX1-NEXT: vmovd %edx, %xmm0 1646; 
XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 1647; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1648; XOPAVX1-NEXT: vmovd %ecx, %xmm1 1649; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 1650; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 1651; XOPAVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 1652; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1653; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1654; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 1655; XOPAVX1-NEXT: .p2align 4, 0x90 1656; XOPAVX1-NEXT: .LBB8_1: # %loop 1657; XOPAVX1-NEXT: # =>This Inner Loop Header: Depth=1 1658; XOPAVX1-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero 1659; XOPAVX1-NEXT: vpcomeqb %xmm2, %xmm5, %xmm5 1660; XOPAVX1-NEXT: vpmovsxbd %xmm5, %xmm6 1661; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] 1662; XOPAVX1-NEXT: vpmovsxbd %xmm5, %xmm5 1663; XOPAVX1-NEXT: vblendvps %xmm5, %xmm3, %xmm4, %xmm5 1664; XOPAVX1-NEXT: vprotd %xmm5, 4112(%rdi,%rax,4), %xmm5 1665; XOPAVX1-NEXT: vblendvps %xmm6, %xmm0, %xmm1, %xmm6 1666; XOPAVX1-NEXT: vprotd %xmm6, 4096(%rdi,%rax,4), %xmm6 1667; XOPAVX1-NEXT: vmovdqu %xmm6, 4096(%rdi,%rax,4) 1668; XOPAVX1-NEXT: vmovdqu %xmm5, 4112(%rdi,%rax,4) 1669; XOPAVX1-NEXT: addq $8, %rax 1670; XOPAVX1-NEXT: jne .LBB8_1 1671; XOPAVX1-NEXT: # %bb.2: # %exit 1672; XOPAVX1-NEXT: vzeroupper 1673; XOPAVX1-NEXT: retq 1674; 1675; XOPAVX2-LABEL: fancierRotate2: 1676; XOPAVX2: # %bb.0: # %entry 1677; XOPAVX2-NEXT: vmovd %edx, %xmm0 1678; XOPAVX2-NEXT: vpbroadcastd %xmm0, %ymm0 1679; XOPAVX2-NEXT: vmovd %ecx, %xmm1 1680; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1 1681; XOPAVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 1682; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1683; XOPAVX2-NEXT: .p2align 4, 0x90 1684; XOPAVX2-NEXT: .LBB8_1: # %loop 1685; XOPAVX2-NEXT: # =>This Inner Loop Header: Depth=1 1686; XOPAVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero 1687; XOPAVX2-NEXT: vpcmpeqd %ymm2, %ymm3, %ymm3 1688; XOPAVX2-NEXT: vblendvps %ymm3, %ymm0, %ymm1, %ymm3 1689; XOPAVX2-NEXT: vextractf128 $1, %ymm3, %xmm4 1690; XOPAVX2-NEXT: vprotd %xmm4, 4112(%rdi,%rax,4), %xmm4 1691; XOPAVX2-NEXT: vprotd %xmm3, 4096(%rdi,%rax,4), %xmm3 1692; XOPAVX2-NEXT: vmovdqu %xmm3, 4096(%rdi,%rax,4) 1693; XOPAVX2-NEXT: vmovdqu %xmm4, 4112(%rdi,%rax,4) 1694; XOPAVX2-NEXT: addq $8, %rax 1695; XOPAVX2-NEXT: jne .LBB8_1 1696; XOPAVX2-NEXT: # %bb.2: # %exit 1697; XOPAVX2-NEXT: vzeroupper 1698; XOPAVX2-NEXT: retq 1699entry: 1700 %i0 = insertelement <8 x i32> undef, i32 %rot0, i32 0 1701 %s0 = shufflevector <8 x i32> %i0, <8 x i32> undef, <8 x i32> zeroinitializer 1702 %i1 = insertelement <8 x i32> undef, i32 %rot1, i32 0 1703 %s1 = shufflevector <8 x i32> %i1, <8 x i32> undef, <8 x i32> zeroinitializer 1704 br label %loop 1705 1706loop: 1707 %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] 1708 %t0 = getelementptr inbounds i8, i8* %control, i64 %index 1709 %t1 = bitcast i8* %t0 to <8 x i8>* 1710 %wide.load = load <8 x i8>, <8 x i8>* %t1, align 1 1711 %t2 = icmp eq <8 x i8> %wide.load, zeroinitializer 1712 %shamt = select <8 x i1> %t2, <8 x i32> %s0, <8 x i32> %s1 1713 %t4 = getelementptr inbounds i32, i32* %arr, i64 %index 1714 %t5 = bitcast i32* %t4 to <8 x i32>* 1715 %wide.load21 = load <8 x i32>, <8 x i32>* %t5, align 4 1716 %rot = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %wide.load21, <8 x i32> %wide.load21, <8 x i32> %shamt) 1717 store <8 x i32> %rot, <8 x i32>* %t5, align 4 1718 %index.next = add i64 %index, 8 1719 %t7 = icmp eq i64 %index.next, 1024 1720 br i1 %t7, label %exit, label %loop 1721 1722exit: 1723 ret void 1724} 1725 1726; 1727; Constant Shifts 1728; 1729 1730define <4 x i64> @constant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { 1731; 
AVX1-LABEL: constant_funnnel_v4i64: 1732; AVX1: # %bb.0: 1733; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1734; AVX1-NEXT: vpsrlq $4, %xmm2, %xmm3 1735; AVX1-NEXT: vpsrlq $14, %xmm2, %xmm2 1736; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] 1737; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm3 1738; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1 1739; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 1740; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1741; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1742; AVX1-NEXT: vpsllq $60, %xmm2, %xmm3 1743; AVX1-NEXT: vpsllq $50, %xmm2, %xmm2 1744; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] 1745; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3 1746; AVX1-NEXT: vpsllq $4, %xmm0, %xmm0 1747; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 1748; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1749; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1750; AVX1-NEXT: retq 1751; 1752; AVX2-LABEL: constant_funnnel_v4i64: 1753; AVX2: # %bb.0: 1754; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1755; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1756; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1757; AVX2-NEXT: retq 1758; 1759; AVX512F-LABEL: constant_funnnel_v4i64: 1760; AVX512F: # %bb.0: 1761; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1762; AVX512F-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1763; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 1764; AVX512F-NEXT: retq 1765; 1766; AVX512VL-LABEL: constant_funnnel_v4i64: 1767; AVX512VL: # %bb.0: 1768; AVX512VL-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1769; AVX512VL-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1770; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 1771; AVX512VL-NEXT: retq 1772; 1773; AVX512BW-LABEL: constant_funnnel_v4i64: 1774; AVX512BW: # %bb.0: 1775; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1776; AVX512BW-NEXT: vpsllvq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1777; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 1778; AVX512BW-NEXT: retq 1779; 1780; AVX512VBMI2-LABEL: constant_funnnel_v4i64: 1781; AVX512VBMI2: # %bb.0: 1782; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1783; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1784; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,14,50,60] 1785; AVX512VBMI2-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 1786; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1787; AVX512VBMI2-NEXT: retq 1788; 1789; AVX512VLBW-LABEL: constant_funnnel_v4i64: 1790; AVX512VLBW: # %bb.0: 1791; AVX512VLBW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1792; AVX512VLBW-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1793; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 1794; AVX512VLBW-NEXT: retq 1795; 1796; AVX512VLVBMI2-LABEL: constant_funnnel_v4i64: 1797; AVX512VLVBMI2: # %bb.0: 1798; AVX512VLVBMI2-NEXT: vpshldvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 1799; AVX512VLVBMI2-NEXT: retq 1800; 1801; XOPAVX1-LABEL: constant_funnnel_v4i64: 1802; XOPAVX1: # %bb.0: 1803; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 1804; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1805; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1806; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1807; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 1808; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1809; XOPAVX1-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1810; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1811; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1812; XOPAVX1-NEXT: retq 1813; 1814; XOPAVX2-LABEL: constant_funnnel_v4i64: 1815; XOPAVX2: # %bb.0: 1816; XOPAVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1817; XOPAVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1818; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1819; XOPAVX2-NEXT: retq 1820 %res = 
call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 4, i64 14, i64 50, i64 60>) 1821 ret <4 x i64> %res 1822} 1823 1824define <8 x i32> @constant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { 1825; AVX1-LABEL: constant_funnnel_v8i32: 1826; AVX1: # %bb.0: 1827; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1828; AVX1-NEXT: vpsrld $21, %xmm2, %xmm3 1829; AVX1-NEXT: vpsrld $23, %xmm2, %xmm4 1830; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1831; AVX1-NEXT: vpsrld $22, %xmm2, %xmm4 1832; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 1833; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] 1834; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1835; AVX1-NEXT: vpsrld $25, %xmm1, %xmm3 1836; AVX1-NEXT: vpsrld $27, %xmm1, %xmm4 1837; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1838; AVX1-NEXT: vpsrld $26, %xmm1, %xmm4 1839; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1 1840; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] 1841; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1842; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1843; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 1844; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1845; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1846; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1847; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1848; AVX1-NEXT: retq 1849; 1850; AVX2-LABEL: constant_funnnel_v8i32: 1851; AVX2: # %bb.0: 1852; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1853; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1854; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1855; AVX2-NEXT: retq 1856; 1857; AVX512F-LABEL: constant_funnnel_v8i32: 1858; AVX512F: # %bb.0: 1859; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1860; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1861; AVX512F-NEXT: vpor %ymm1, 
%ymm0, %ymm0 1862; AVX512F-NEXT: retq 1863; 1864; AVX512VL-LABEL: constant_funnnel_v8i32: 1865; AVX512VL: # %bb.0: 1866; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1867; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1868; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 1869; AVX512VL-NEXT: retq 1870; 1871; AVX512BW-LABEL: constant_funnnel_v8i32: 1872; AVX512BW: # %bb.0: 1873; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1874; AVX512BW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1875; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 1876; AVX512BW-NEXT: retq 1877; 1878; AVX512VBMI2-LABEL: constant_funnnel_v8i32: 1879; AVX512VBMI2: # %bb.0: 1880; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1881; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1882; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,8,9,10,11] 1883; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 1884; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1885; AVX512VBMI2-NEXT: retq 1886; 1887; AVX512VLBW-LABEL: constant_funnnel_v8i32: 1888; AVX512VLBW: # %bb.0: 1889; AVX512VLBW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1890; AVX512VLBW-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1891; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 1892; AVX512VLBW-NEXT: retq 1893; 1894; AVX512VLVBMI2-LABEL: constant_funnnel_v8i32: 1895; AVX512VLVBMI2: # %bb.0: 1896; AVX512VLVBMI2-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 1897; AVX512VLVBMI2-NEXT: retq 1898; 1899; XOPAVX1-LABEL: constant_funnnel_v8i32: 1900; XOPAVX1: # %bb.0: 1901; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 1902; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1903; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1904; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1905; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 1906; XOPAVX1-NEXT: vextractf128 
$1, %ymm0, %xmm0 1907; XOPAVX1-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1908; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1909; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1910; XOPAVX1-NEXT: retq 1911; 1912; XOPAVX2-LABEL: constant_funnnel_v8i32: 1913; XOPAVX2: # %bb.0: 1914; XOPAVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1915; XOPAVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1916; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1917; XOPAVX2-NEXT: retq 1918 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>) 1919 ret <8 x i32> %res 1920} 1921 1922define <16 x i16> @constant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind { 1923; AVX1-LABEL: constant_funnnel_v16i16: 1924; AVX1: # %bb.0: 1925; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1926; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 1927; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 1928; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5,6],xmm2[7] 1929; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 1930; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1931; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 1932; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 1933; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1934; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1935; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 1936; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 1937; AVX1-NEXT: retq 1938; 1939; AVX2-LABEL: constant_funnnel_v16i16: 1940; AVX2: # %bb.0: 1941; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 1942; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1943; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15] 1944; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1945; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1946; AVX2-NEXT: vpor %ymm1, %ymm0, 
%ymm0 1947; AVX2-NEXT: retq 1948; 1949; AVX512F-LABEL: constant_funnnel_v16i16: 1950; AVX512F: # %bb.0: 1951; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 1952; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1953; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15] 1954; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1955; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1956; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 1957; AVX512F-NEXT: retq 1958; 1959; AVX512VL-LABEL: constant_funnnel_v16i16: 1960; AVX512VL: # %bb.0: 1961; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 1962; AVX512VL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 1963; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15] 1964; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 1965; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1966; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 1967; AVX512VL-NEXT: retq 1968; 1969; AVX512BW-LABEL: constant_funnnel_v16i16: 1970; AVX512BW: # %bb.0: 1971; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1972; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1973; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm0 1974; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 1975; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 1976; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm1 1977; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 1978; AVX512BW-NEXT: retq 1979; 1980; AVX512VBMI2-LABEL: constant_funnnel_v16i16: 1981; AVX512VBMI2: # %bb.0: 1982; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1983; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1984; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1985; AVX512VBMI2-NEXT: vpshldvw %zmm2, %zmm1, %zmm0 1986; AVX512VBMI2-NEXT: # kill: def $ymm0 killed 
$ymm0 killed $zmm0 1987; AVX512VBMI2-NEXT: retq 1988; 1989; AVX512VLBW-LABEL: constant_funnnel_v16i16: 1990; AVX512VLBW: # %bb.0: 1991; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1992; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 1993; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1994; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 1995; AVX512VLBW-NEXT: retq 1996; 1997; AVX512VLVBMI2-LABEL: constant_funnnel_v16i16: 1998; AVX512VLVBMI2: # %bb.0: 1999; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 2000; AVX512VLVBMI2-NEXT: retq 2001; 2002; XOPAVX1-LABEL: constant_funnnel_v16i16: 2003; XOPAVX1: # %bb.0: 2004; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 2005; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2006; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2007; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 2008; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 2009; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 2010; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2011; XOPAVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 2012; XOPAVX1-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2013; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2014; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 2015; XOPAVX1-NEXT: retq 2016; 2017; XOPAVX2-LABEL: constant_funnnel_v16i16: 2018; XOPAVX2: # %bb.0: 2019; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 2020; XOPAVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 2021; XOPAVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15] 2022; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 2023; XOPAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2024; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 2025; XOPAVX2-NEXT: retq 2026 %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, 
i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>) 2027 ret <16 x i16> %res 2028} 2029 2030define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { 2031; AVX1-LABEL: constant_funnnel_v32i8: 2032; AVX1: # %bb.0: 2033; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2034; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2035; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,128,64,32,16,8,4,2] 2036; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 2037; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 2038; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 2039; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2040; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,2,4,8,16,32,64,128] 2041; AVX1-NEXT: vpmullw %xmm6, %xmm2, %xmm2 2042; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 2043; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 2044; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2045; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 2046; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 2047; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2048; AVX1-NEXT: vpmullw %xmm6, %xmm0, %xmm0 2049; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 2050; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 2051; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2052; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2053; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 2054; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 2055; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 2056; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 2057; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] 2058; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = 
[2,256,128,64,32,16,8,4] 2059; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 2060; AVX1-NEXT: vpsrlw $8, %xmm5, %xmm5 2061; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2062; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [2,4,8,16,32,64,128,256] 2063; AVX1-NEXT: vpmullw %xmm7, %xmm2, %xmm2 2064; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2065; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 2066; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 2067; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 2068; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] 2069; AVX1-NEXT: vpmullw %xmm6, %xmm3, %xmm3 2070; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 2071; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2072; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1 2073; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2074; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 2075; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2076; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 2077; AVX1-NEXT: retq 2078; 2079; AVX2-LABEL: constant_funnnel_v32i8: 2080; AVX2: # %bb.0: 2081; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 2082; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2083; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] 2084; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 2085; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 2086; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 2087; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2088; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3 2089; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 2090; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 2091; AVX2-NEXT: vpaddb %ymm3, %ymm3, %ymm3 2092; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 2093; AVX2-NEXT: vpsrlw $1, 
%ymm1, %ymm1 2094; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2095; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2096; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 2097; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 2098; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 2099; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 2100; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2101; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2102; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 2103; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 2104; AVX2-NEXT: retq 2105; 2106; AVX512F-LABEL: constant_funnnel_v32i8: 2107; AVX512F: # %bb.0: 2108; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 2109; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2110; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] 2111; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] 2112; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 2113; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 2114; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2115; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 2116; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 2117; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 2118; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 2119; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 2120; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 2121; AVX512F-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2122; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 2123; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 2124; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 2125; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 2126; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 2127; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2128; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 2129; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 2130; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 2131; AVX512F-NEXT: retq 2132; 2133; AVX512VL-LABEL: constant_funnnel_v32i8: 2134; AVX512VL: # %bb.0: 2135; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 2136; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2137; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] 2138; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1] 2139; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 2140; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 2141; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2142; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 2143; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 2144; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 2145; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 2146; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 2147; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm1 2148; AVX512VL-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2149; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 2150; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 2151; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 2152; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 2153; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 2154; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2155; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 2156; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 2157; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 2158; AVX512VL-NEXT: retq 2159; 2160; AVX512BW-LABEL: constant_funnnel_v32i8: 2161; AVX512BW: # %bb.0: 2162; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 2163; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2164; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 2165; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2166; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 2167; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2168; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 2169; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2170; AVX512BW-NEXT: retq 2171; 2172; AVX512VBMI2-LABEL: constant_funnnel_v32i8: 2173; AVX512VBMI2: # %bb.0: 2174; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 2175; AVX512VBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2176; AVX512VBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1 2177; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2178; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 2179; AVX512VBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2180; AVX512VBMI2-NEXT: vporq %zmm1, 
%zmm0, %zmm0 2181; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 2182; AVX512VBMI2-NEXT: retq 2183; 2184; AVX512VLBW-LABEL: constant_funnnel_v32i8: 2185; AVX512VLBW: # %bb.0: 2186; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 2187; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2188; AVX512VLBW-NEXT: vpsrlw $1, %ymm1, %ymm1 2189; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2190; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 2191; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2192; AVX512VLBW-NEXT: vporq %zmm1, %zmm0, %zmm0 2193; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 2194; AVX512VLBW-NEXT: retq 2195; 2196; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8: 2197; AVX512VLVBMI2: # %bb.0: 2198; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 2199; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2200; AVX512VLVBMI2-NEXT: vpsrlw $1, %ymm1, %ymm1 2201; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2202; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 2203; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2204; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm0, %zmm0 2205; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 2206; AVX512VLVBMI2-NEXT: retq 2207; 2208; XOPAVX1-LABEL: constant_funnnel_v32i8: 2209; XOPAVX1: # %bb.0: 2210; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2211; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 2212; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 2213; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0 2214; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 2215; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2216; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 2217; XOPAVX1-NEXT: vpshlb %xmm3, %xmm2, %xmm2 2218; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250] 2219; XOPAVX1-NEXT: vpshlb %xmm4, %xmm2, %xmm2 
2220; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1 2221; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1 2222; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 2223; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 2224; XOPAVX1-NEXT: retq 2225; 2226; XOPAVX2-LABEL: constant_funnnel_v32i8: 2227; XOPAVX2: # %bb.0: 2228; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 2229; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1] 2230; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2 2231; XOPAVX2-NEXT: vpshlb %xmm3, %xmm0, %xmm0 2232; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 2233; XOPAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 2234; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2235; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2236; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [249,250,251,252,253,254,255,0,249,0,255,254,253,252,251,250] 2237; XOPAVX2-NEXT: vpshlb %xmm3, %xmm2, %xmm2 2238; XOPAVX2-NEXT: vpshlb %xmm3, %xmm1, %xmm1 2239; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 2240; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 2241; XOPAVX2-NEXT: retq 2242 %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>) 2243 ret <32 x i8> %res 2244} 2245 2246; 2247; Uniform Constant Shifts 2248; 2249 2250define <4 x i64> @splatconstant_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind { 2251; AVX1-LABEL: splatconstant_funnnel_v4i64: 2252; AVX1: # %bb.0: 2253; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm2 2254; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2255; AVX1-NEXT: vpsrlq $50, %xmm1, %xmm1 2256; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2257; AVX1-NEXT: vpsllq $14, %xmm0, %xmm2 2258; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2259; AVX1-NEXT: vpsllq $14, %xmm0, %xmm0 2260; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 2261; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 2262; 
AVX1-NEXT: retq 2263; 2264; AVX2-LABEL: splatconstant_funnnel_v4i64: 2265; AVX2: # %bb.0: 2266; AVX2-NEXT: vpsrlq $50, %ymm1, %ymm1 2267; AVX2-NEXT: vpsllq $14, %ymm0, %ymm0 2268; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 2269; AVX2-NEXT: retq 2270; 2271; AVX512F-LABEL: splatconstant_funnnel_v4i64: 2272; AVX512F: # %bb.0: 2273; AVX512F-NEXT: vpsrlq $50, %ymm1, %ymm1 2274; AVX512F-NEXT: vpsllq $14, %ymm0, %ymm0 2275; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 2276; AVX512F-NEXT: retq 2277; 2278; AVX512VL-LABEL: splatconstant_funnnel_v4i64: 2279; AVX512VL: # %bb.0: 2280; AVX512VL-NEXT: vpsrlq $50, %ymm1, %ymm1 2281; AVX512VL-NEXT: vpsllq $14, %ymm0, %ymm0 2282; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 2283; AVX512VL-NEXT: retq 2284; 2285; AVX512BW-LABEL: splatconstant_funnnel_v4i64: 2286; AVX512BW: # %bb.0: 2287; AVX512BW-NEXT: vpsrlq $50, %ymm1, %ymm1 2288; AVX512BW-NEXT: vpsllq $14, %ymm0, %ymm0 2289; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 2290; AVX512BW-NEXT: retq 2291; 2292; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i64: 2293; AVX512VBMI2: # %bb.0: 2294; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2295; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2296; AVX512VBMI2-NEXT: vpshldq $14, %zmm1, %zmm0, %zmm0 2297; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2298; AVX512VBMI2-NEXT: retq 2299; 2300; AVX512VLBW-LABEL: splatconstant_funnnel_v4i64: 2301; AVX512VLBW: # %bb.0: 2302; AVX512VLBW-NEXT: vpsrlq $50, %ymm1, %ymm1 2303; AVX512VLBW-NEXT: vpsllq $14, %ymm0, %ymm0 2304; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 2305; AVX512VLBW-NEXT: retq 2306; 2307; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i64: 2308; AVX512VLVBMI2: # %bb.0: 2309; AVX512VLVBMI2-NEXT: vpshldq $14, %ymm1, %ymm0, %ymm0 2310; AVX512VLVBMI2-NEXT: retq 2311; 2312; XOPAVX1-LABEL: splatconstant_funnnel_v4i64: 2313; XOPAVX1: # %bb.0: 2314; XOPAVX1-NEXT: vpsrlq $50, %xmm1, %xmm2 2315; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2316; XOPAVX1-NEXT: vpsrlq $50, %xmm1, 
%xmm1 2317; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2318; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm2 2319; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2320; XOPAVX1-NEXT: vpsllq $14, %xmm0, %xmm0 2321; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 2322; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 2323; XOPAVX1-NEXT: retq 2324; 2325; XOPAVX2-LABEL: splatconstant_funnnel_v4i64: 2326; XOPAVX2: # %bb.0: 2327; XOPAVX2-NEXT: vpsrlq $50, %ymm1, %ymm1 2328; XOPAVX2-NEXT: vpsllq $14, %ymm0, %ymm0 2329; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 2330; XOPAVX2-NEXT: retq 2331 %res = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> <i64 14, i64 14, i64 14, i64 14>) 2332 ret <4 x i64> %res 2333} 2334 2335define <8 x i32> @splatconstant_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind { 2336; AVX1-LABEL: splatconstant_funnnel_v8i32: 2337; AVX1: # %bb.0: 2338; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2 2339; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2340; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1 2341; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2342; AVX1-NEXT: vpslld $4, %xmm0, %xmm2 2343; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2344; AVX1-NEXT: vpslld $4, %xmm0, %xmm0 2345; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 2346; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 2347; AVX1-NEXT: retq 2348; 2349; AVX2-LABEL: splatconstant_funnnel_v8i32: 2350; AVX2: # %bb.0: 2351; AVX2-NEXT: vpsrld $28, %ymm1, %ymm1 2352; AVX2-NEXT: vpslld $4, %ymm0, %ymm0 2353; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 2354; AVX2-NEXT: retq 2355; 2356; AVX512F-LABEL: splatconstant_funnnel_v8i32: 2357; AVX512F: # %bb.0: 2358; AVX512F-NEXT: vpsrld $28, %ymm1, %ymm1 2359; AVX512F-NEXT: vpslld $4, %ymm0, %ymm0 2360; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 2361; AVX512F-NEXT: retq 2362; 2363; AVX512VL-LABEL: splatconstant_funnnel_v8i32: 2364; AVX512VL: # %bb.0: 2365; AVX512VL-NEXT: vpsrld $28, %ymm1, %ymm1 2366; AVX512VL-NEXT: vpslld $4, %ymm0, %ymm0 2367; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 2368; 
AVX512VL-NEXT: retq 2369; 2370; AVX512BW-LABEL: splatconstant_funnnel_v8i32: 2371; AVX512BW: # %bb.0: 2372; AVX512BW-NEXT: vpsrld $28, %ymm1, %ymm1 2373; AVX512BW-NEXT: vpslld $4, %ymm0, %ymm0 2374; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 2375; AVX512BW-NEXT: retq 2376; 2377; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i32: 2378; AVX512VBMI2: # %bb.0: 2379; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2380; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2381; AVX512VBMI2-NEXT: vpshldd $4, %zmm1, %zmm0, %zmm0 2382; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 2383; AVX512VBMI2-NEXT: retq 2384; 2385; AVX512VLBW-LABEL: splatconstant_funnnel_v8i32: 2386; AVX512VLBW: # %bb.0: 2387; AVX512VLBW-NEXT: vpsrld $28, %ymm1, %ymm1 2388; AVX512VLBW-NEXT: vpslld $4, %ymm0, %ymm0 2389; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 2390; AVX512VLBW-NEXT: retq 2391; 2392; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i32: 2393; AVX512VLVBMI2: # %bb.0: 2394; AVX512VLVBMI2-NEXT: vpshldd $4, %ymm1, %ymm0, %ymm0 2395; AVX512VLVBMI2-NEXT: retq 2396; 2397; XOPAVX1-LABEL: splatconstant_funnnel_v8i32: 2398; XOPAVX1: # %bb.0: 2399; XOPAVX1-NEXT: vpsrld $28, %xmm1, %xmm2 2400; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2401; XOPAVX1-NEXT: vpsrld $28, %xmm1, %xmm1 2402; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 2403; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm2 2404; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2405; XOPAVX1-NEXT: vpslld $4, %xmm0, %xmm0 2406; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 2407; XOPAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 2408; XOPAVX1-NEXT: retq 2409; 2410; XOPAVX2-LABEL: splatconstant_funnnel_v8i32: 2411; XOPAVX2: # %bb.0: 2412; XOPAVX2-NEXT: vpsrld $28, %ymm1, %ymm1 2413; XOPAVX2-NEXT: vpslld $4, %ymm0, %ymm0 2414; XOPAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 2415; XOPAVX2-NEXT: retq 2416 %res = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>) 2417 
ret <8 x i32> %res
}

; Funnel-shift-left of two <16 x i16> vectors where every lane uses the same
; constant shift amount (7).
; The expected code below uses a single vpshldw $7 on the two run lines that
; enable VBMI2, and a vpsllw $7 / vpsrlw $9 pair combined with an OR on all
; other run lines (split into 128-bit halves when only AVX1 is available).
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define <16 x i16> @splatconstant_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y) nounwind {
; AVX1-LABEL: splatconstant_funnnel_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $9, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsrlw $9, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_funnnel_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $9, %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_funnnel_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $9, %ymm1, %ymm1
; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $9, %ymm1, %ymm1
; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $9, %ymm1, %ymm1
; AVX512BW-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshldw $7, %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlw $9, %ymm1, %ymm1
; AVX512VLBW-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldw $7, %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_funnnel_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpsrlw $9, %xmm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; XOPAVX1-NEXT:    vpsrlw $9, %xmm1, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOPAVX1-NEXT:    vpsllw $7, %xmm0, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_funnnel_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlw $9, %ymm1, %ymm1
; XOPAVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <16 x i16> %res
}

; Funnel-shift-left of two <32 x i8> vectors where every lane uses the same
; constant shift amount (4).
; None of the run lines is expected to produce a dedicated byte funnel-shift
; instruction. The expected code uses 16-bit shifts by 4 (vpsllw / vpsrlw)
; whose results are merged through a constant-pool mask (vpand + vpor,
; vpternlogq $216, or vpcmov). The XOP with AVX1 configuration instead uses
; vpshlb with splat shift vectors of 4 and 252.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX1-LABEL: splatconstant_funnnel_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_funnnel_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_funnnel_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $4, %ymm1, %ymm0
; AVX512VL-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VBMI2-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VBMI2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX512VLBW-NEXT:    vpsrlw $4, %ymm1, %ymm0
; AVX512VLBW-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v32i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX512VLVBMI2-NEXT:    vpsrlw $4, %ymm1, %ymm0
; AVX512VLVBMI2-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_funnnel_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4]
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_funnnel_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; XOPAVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <32 x i8> %res
}