; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2

; Verify that the following shifts are lowered into a sequence of two shifts plus
; a blend. On pre-avx2 targets, instead of scalarizing logical and arithmetic
; packed shift right by a constant build_vector the backend should always try to
; emit a simpler sequence of two shifts + blend when possible.

; Logical shift right, v8i16, splat boundary after element 1.
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $3, %xmm1
; SSE-NEXT: psrlw $2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

; Logical shift right, v8i16, splat boundary after element 3.
define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $3, %xmm1
; SSE-NEXT: psrlw $2, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test2:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test2:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

; Logical shift right, v4i32; AVX2 can use a variable shift (vpsrlvd) instead.
define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $3, %xmm1
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $3, %xmm1
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test4:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}

; Arithmetic shift right variants of test1-test4.
define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $3, %xmm1
; SSE-NEXT: psraw $2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test5:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test5:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: retq
  %ashr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %ashr
}

define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $3, %xmm1
; SSE-NEXT: psraw $2, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test6:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
  %ashr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %ashr
}

define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $3, %xmm1
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test7:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
  %ashr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %ashr
}

define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $3, %xmm1
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
  %ashr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %ashr
}

; Non-contiguous shift-amount pattern: needs a pblendw-style mask rather than
; a single movss/movsd blend.
define <8 x i16> @test9(<8 x i16> %a) {
; SSE-LABEL: test9:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,0,0]
; SSE-NEXT: psraw $1, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test9:
; AVX: # %bb.0:
; AVX-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4],xmm1[5,6,7]
; AVX-NEXT: retq
  %ashr = ashr <8 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <8 x i16> %ashr
}

; Shift amounts that are undef except one element: only the defined lane
; constrains lowering.
define <8 x i32> @test10(<8 x i32>* %a) {
; SSE-LABEL: test10:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: psrad $1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test10:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpsrad $1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpsrad $1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %ld = load <8 x i32>, <8 x i32>* %a, align 32
  %ashr = ashr <8 x i32> %ld, <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %ashr
}

; test11 vs test12 - show difference between v16i16 that is repeated/non-repeated at v8i16 level (for PBLENDW masks).

define <16 x i16> @test11(<16 x i16> %a) {
; SSE-LABEL: test11:
; SSE: # %bb.0:
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test11:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 1, i16 1, i16 1, i16 3, i16 1>
  ret <16 x i16> %shl
}

define <16 x i16> @test12(<16 x i16> %a) {
; SSE-LABEL: test12:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,8,2,2,2,8,8,8]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test12:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
  ret <16 x i16> %shl
}