; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST-PERLANE

; fold (shl 0, x) -> 0
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (shl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_shl_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %2 = shl <4 x i32> %1, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_outofrange3(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (shl x, 0) -> x
define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (shl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_shl_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_known_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %2
}

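; A hypothetical scalar sketch of the known-zero fold (illustrative addition,
; not part of the original test set; CHECK lines deliberately omitted, as
; utils/update_llc_test_checks.py would regenerate them): the mask keeps only
; the top sixteen bits and the shift then discards them, so the result is
; provably zero.
define i32 @combine_shl_known_zero_scalar(i32 %x) {
  %1 = and i32 %x, 4294901760 ; 0xFFFF0000
  %2 = shl i32 %1, 16
  ret i32 %2
}
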
define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_known_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65536,32768,16384,8192]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_known_zero1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 8589803520, i32 17179607040, i32 34359214080>
  %2 = shl <4 x i32> %1, <i32 16, i32 15, i32 14, i32 13>
  ret <4 x i32> %2
}

; fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_shl_trunc_and:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_trunc_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-ALL-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST-ALL:       # %bb.0:
; AVX-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
; AVX-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-FAST-ALL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-FAST-ALL-NEXT:    vzeroupper
; AVX-FAST-ALL-NEXT:    retq
;
; AVX-FAST-PERLANE-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST-PERLANE:       # %bb.0:
; AVX-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-FAST-PERLANE-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-FAST-PERLANE-NEXT:    vzeroupper
; AVX-FAST-PERLANE-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = shl <4 x i32> %x, %2
  ret <4 x i32> %3
}

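; A hypothetical splat-mask variant of combine_vec_shl_trunc_and (illustrative
; only; CHECK lines omitted, utils/update_llc_test_checks.py would regenerate
; them). The same narrowing applies: the and/trunc pair can be performed in
; <4 x i32> before the variable shift.
define <4 x i32> @combine_vec_shl_trunc_and_splat(<4 x i32> %x, <4 x i64> %y) {
  %1 = and <4 x i64> %y, <i64 31, i64 31, i64 31, i64 31>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = shl <4 x i32> %x, %2
  ret <4 x i32> %3
}
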
; fold (shl (shl x, c1), c2) -> (shl x, (add c1, c2))
define <4 x i32> @combine_vec_shl_shl0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = shl <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_shl1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,64,256,1024]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_shl1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = shl <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

; fold (shl (shl x, c1), c2) -> 0
define <4 x i32> @combine_vec_shl_shl_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = shl <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = shl <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $20, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $20, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $20, %xmm1
; SSE41-NEXT:    pslld $20, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpslld $20, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %3
}

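; A hypothetical zext analogue of combine_vec_shl_ext_shl0 (illustrative only;
; CHECK lines omitted, utils/update_llc_test_checks.py would regenerate them).
; The combined 4+16 shift still fits in 32 bits, so the same
; (shl (ext (shl x, c1)), c2) -> (shl (ext x), c1+c2) reassociation is
; expected to apply.
define <8 x i32> @combine_vec_shl_zext_shl0(<8 x i16> %x) {
  %1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %3
}
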
define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_ext_shl1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 31, i32 31, i32 30, i32 30, i32 29, i32 29, i32 28, i32 28>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [131072,524288,2097152,8388608]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [33554432,134217728,536870912,2147483648]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <8 x i32> %3
}

; fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i32> %3
}

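; A hypothetical scalar form of the zext(lshr) fold above (illustrative only;
; CHECK lines omitted, utils/update_llc_test_checks.py would regenerate them).
; Shifting right and then left by the same amount only clears the low four
; bits, so the shl can be performed in the narrow type before the zero
; extension.
define i64 @combine_shl_zext_lshr_scalar(i32 %x) {
  %1 = lshr i32 %x, 4
  %2 = zext i32 %1 to i64
  %3 = shl i64 %2, 4
  ret i64 %3
}
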
define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i32> %3
}

; fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
define <4 x i32> @combine_vec_shl_ge_ashr_exact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ge_ashr_exact0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_exact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

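; A hypothetical scalar counterpart of the exact-ashr fold (illustrative only;
; CHECK lines omitted, utils/update_llc_test_checks.py would regenerate them).
; Because the ashr is 'exact' no bits are lost, so shl by 5 after ashr by 3 is
; just shl by 2.
define i32 @combine_shl_ge_ashr_exact_scalar(i32 %x) {
  %1 = ashr exact i32 %x, 3
  %2 = shl i32 %1, 5
  ret i32 %2
}
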
define <4 x i32> @combine_vec_shl_ge_ashr_exact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_ge_ashr_exact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $3, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    psrad $4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ge_ashr_exact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $5, %xmm1
; SSE41-NEXT:    psrad $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_exact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sr[la] exact SEL(X,Y), C1), C2) -> (shl SEL(X,Y), (C2-C1)) if C1 <= C2
define i32 @combine_shl_ge_sel_ashr_exact0(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: combine_shl_ge_sel_ashr_exact0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    testl %edx, %edx
; CHECK-NEXT:    cmovel %esi, %edi
; CHECK-NEXT:    leal (,%rdi,4), %eax
; CHECK-NEXT:    retq
  %cmp = icmp ne i32 %z, 0
  %ashrx = ashr exact i32 %x, 3
  %ashry = ashr exact i32 %y, 3
  %sel = select i1 %cmp, i32 %ashrx, i32 %ashry
  %shl = shl i32 %sel, 5
  ret i32 %shl
}

; fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C1-C2)) if C1 > C2
define <4 x i32> @combine_vec_shl_lt_ashr_exact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_exact0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_exact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

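; A hypothetical scalar counterpart of the C1 > C2 case (illustrative only;
; CHECK lines omitted, utils/update_llc_test_checks.py would regenerate them):
; ashr exact by 5 followed by shl by 3 reduces to ashr by 2.
define i32 @combine_shl_lt_ashr_exact_scalar(i32 %x) {
  %1 = ashr exact i32 %x, 5
  %2 = shl i32 %1, 3
  ret i32 %2
}
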
define <4 x i32> @combine_vec_shl_lt_ashr_exact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_lt_ashr_exact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $5, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $7, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    psrad $6, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_lt_ashr_exact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $7, %xmm1
; SSE41-NEXT:    psrad $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_exact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) if C2 > C1
define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_gt_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $3, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    psrld $4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_gt_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $5, %xmm1
; SSE41-NEXT:    psrld $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

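; A hypothetical scalar counterpart of the lshr case with C2 > C1
; (illustrative only; CHECK lines omitted, utils/update_llc_test_checks.py
; would regenerate them): lshr by 3 then shl by 5 becomes shl by 2 plus a
; mask clearing the low five bits.
define i32 @combine_shl_gt_lshr_scalar(i32 %x) {
  %1 = lshr i32 %x, 3
  %2 = shl i32 %1, 5
  ret i32 %2
}
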
; fold (shl (srl x, c1), c2) -> (and (srl x, (sub c1, c2)), MASK) if C1 >= C2
define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_le_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $5, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $7, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    psrld $6, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_le_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $7, %xmm1
; SSE41-NEXT:    psrld $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr0:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

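; A hypothetical scalar counterpart of the sra/shl-by-equal-amounts fold
; (illustrative only; CHECK lines omitted, utils/update_llc_test_checks.py
; would regenerate them): ashr then shl by 5 is equivalent to and with -32,
; i.e. (shl -1, 5).
define i32 @combine_shl_ashr_scalar(i32 %x) {
  %1 = ashr i32 %x, 5
  %2 = shl i32 %1, 5
  ret i32 %2
}
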
; fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_add1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_add1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

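; A hypothetical scalar counterpart of the shl(add) fold (illustrative only;
; CHECK lines omitted, utils/update_llc_test_checks.py would regenerate them):
; (x + 5) << 2 becomes (x << 2) + 20, which exposes an LEA on x86.
define i32 @combine_shl_add_scalar(i32 %x) {
  %1 = add i32 %x, 5
  %2 = shl i32 %1, 2
  ret i32 %2
}
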
; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_or0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_or1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_or1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_or1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [20,20,20,20]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_mul0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [10,24,56,128]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_mul1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

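; A hypothetical scalar counterpart of the shl(mul) fold (illustrative only;
; CHECK lines omitted, utils/update_llc_test_checks.py would regenerate them):
; (x * 5) << 2 is simply x * 20.
define i32 @combine_shl_mul_scalar(i32 %x) {
  %1 = mul i32 %x, 5
  %2 = shl i32 %1, 2
  ret i32 %2
}
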
; fold (add (shl x, c1), c2) -> (or (shl x, c1), c2)
define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shl_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_nonsplat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 4, i32 5>
  %2 = add <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_and_nonsplat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  %3 = add <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0) {
; SSE2-LABEL: combine_vec_add_shuffle_shl:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    pslld $2, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,3,0]
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shuffle_shl:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $3, %xmm1
; SSE41-NEXT:    pslld $2, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE41-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shuffle_shl:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 0, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  %3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %3
}

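; A hypothetical scalar counterpart of the add(shl) -> or fold (illustrative
; only; CHECK lines omitted, utils/update_llc_test_checks.py would regenerate
; them): after x << 4 the low four bits are known zero, so adding 3 is the
; same as oring in 3.
define i32 @combine_add_shl_scalar(i32 %x) {
  %1 = shl i32 %x, 4
  %2 = add i32 %1, 3
  ret i32 %2
}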