1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 3; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 4; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41 5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 6; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW 7; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL 8; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE 9; 10; Verify that the DAG combiner correctly folds bitwise operations across 11; shuffles, nested shuffles with undef, pairs of nested shuffles, and other 12; basic and always-safe patterns. Also test that the DAG combiner will combine 13; target-specific shuffle instructions where reasonable. 
14 15target triple = "x86_64-unknown-unknown" 16 17declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) 18declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) 19declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) 20 21define <4 x i32> @combine_pshufd1(<4 x i32> %a) { 22; CHECK-LABEL: combine_pshufd1: 23; CHECK: # %bb.0: # %entry 24; CHECK-NEXT: retq 25entry: 26 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 27 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) 28 ret <4 x i32> %c 29} 30 31define <4 x i32> @combine_pshufd2(<4 x i32> %a) { 32; CHECK-LABEL: combine_pshufd2: 33; CHECK: # %bb.0: # %entry 34; CHECK-NEXT: retq 35entry: 36 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 37 %b.cast = bitcast <4 x i32> %b to <8 x i16> 38 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28) 39 %c.cast = bitcast <8 x i16> %c to <4 x i32> 40 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 41 ret <4 x i32> %d 42} 43 44define <4 x i32> @combine_pshufd3(<4 x i32> %a) { 45; CHECK-LABEL: combine_pshufd3: 46; CHECK: # %bb.0: # %entry 47; CHECK-NEXT: retq 48entry: 49 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 50 %b.cast = bitcast <4 x i32> %b to <8 x i16> 51 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28) 52 %c.cast = bitcast <8 x i16> %c to <4 x i32> 53 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 54 ret <4 x i32> %d 55} 56 57define <4 x i32> @combine_pshufd4(<4 x i32> %a) { 58; SSE-LABEL: combine_pshufd4: 59; SSE: # %bb.0: # %entry 60; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 61; SSE-NEXT: retq 62; 63; AVX-LABEL: combine_pshufd4: 64; AVX: # %bb.0: # %entry 65; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 66; AVX-NEXT: retq 67entry: 68 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) 69 %b.cast = bitcast <4 x i32> %b to <8 x i16> 70 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 
x i16> %b.cast, i8 27) 71 %c.cast = bitcast <8 x i16> %c to <4 x i32> 72 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) 73 ret <4 x i32> %d 74} 75 76define <4 x i32> @combine_pshufd5(<4 x i32> %a) { 77; SSE-LABEL: combine_pshufd5: 78; SSE: # %bb.0: # %entry 79; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 80; SSE-NEXT: retq 81; 82; AVX-LABEL: combine_pshufd5: 83; AVX: # %bb.0: # %entry 84; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 85; AVX-NEXT: retq 86entry: 87 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) 88 %b.cast = bitcast <4 x i32> %b to <8 x i16> 89 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27) 90 %c.cast = bitcast <8 x i16> %c to <4 x i32> 91 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76) 92 ret <4 x i32> %d 93} 94 95define <4 x i32> @combine_pshufd6(<4 x i32> %a) { 96; SSE-LABEL: combine_pshufd6: 97; SSE: # %bb.0: # %entry 98; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 99; SSE-NEXT: retq 100; 101; AVX1-LABEL: combine_pshufd6: 102; AVX1: # %bb.0: # %entry 103; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 104; AVX1-NEXT: retq 105; 106; AVX2-LABEL: combine_pshufd6: 107; AVX2: # %bb.0: # %entry 108; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 109; AVX2-NEXT: retq 110entry: 111 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) 112 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8) 113 ret <4 x i32> %c 114} 115 116define <8 x i16> @combine_pshuflw1(<8 x i16> %a) { 117; CHECK-LABEL: combine_pshuflw1: 118; CHECK: # %bb.0: # %entry 119; CHECK-NEXT: retq 120entry: 121 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 122 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 123 ret <8 x i16> %c 124} 125 126define <8 x i16> @combine_pshuflw2(<8 x i16> %a) { 127; CHECK-LABEL: combine_pshuflw2: 128; CHECK: # %bb.0: # %entry 129; CHECK-NEXT: retq 130entry: 131 %b = call <8 x i16> 
@llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 132 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) 133 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 134 ret <8 x i16> %d 135} 136 137define <8 x i16> @combine_pshuflw3(<8 x i16> %a) { 138; SSE-LABEL: combine_pshuflw3: 139; SSE: # %bb.0: # %entry 140; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 141; SSE-NEXT: retq 142; 143; AVX-LABEL: combine_pshuflw3: 144; AVX: # %bb.0: # %entry 145; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 146; AVX-NEXT: retq 147entry: 148 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 149 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) 150 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 151 ret <8 x i16> %d 152} 153 154define <8 x i16> @combine_pshufhw1(<8 x i16> %a) { 155; SSE-LABEL: combine_pshufhw1: 156; SSE: # %bb.0: # %entry 157; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 158; SSE-NEXT: retq 159; 160; AVX-LABEL: combine_pshufhw1: 161; AVX: # %bb.0: # %entry 162; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 163; AVX-NEXT: retq 164entry: 165 %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) 166 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 167 %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) 168 ret <8 x i16> %d 169} 170 171define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 172; SSE-LABEL: combine_bitwise_ops_test1: 173; SSE: # %bb.0: 174; SSE-NEXT: pand %xmm1, %xmm0 175; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 176; SSE-NEXT: retq 177; 178; AVX-LABEL: combine_bitwise_ops_test1: 179; AVX: # %bb.0: 180; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 181; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 182; AVX-NEXT: retq 183 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 184 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x 
i32><i32 0, i32 2, i32 1, i32 3> 185 %and = and <4 x i32> %shuf1, %shuf2 186 ret <4 x i32> %and 187} 188 189define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 190; SSE-LABEL: combine_bitwise_ops_test2: 191; SSE: # %bb.0: 192; SSE-NEXT: por %xmm1, %xmm0 193; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 194; SSE-NEXT: retq 195; 196; AVX-LABEL: combine_bitwise_ops_test2: 197; AVX: # %bb.0: 198; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 199; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 200; AVX-NEXT: retq 201 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 202 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 203 %or = or <4 x i32> %shuf1, %shuf2 204 ret <4 x i32> %or 205} 206 207define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 208; SSE-LABEL: combine_bitwise_ops_test3: 209; SSE: # %bb.0: 210; SSE-NEXT: pxor %xmm1, %xmm0 211; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 212; SSE-NEXT: retq 213; 214; AVX-LABEL: combine_bitwise_ops_test3: 215; AVX: # %bb.0: 216; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 217; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 218; AVX-NEXT: retq 219 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 220 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 221 %xor = xor <4 x i32> %shuf1, %shuf2 222 ret <4 x i32> %xor 223} 224 225define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 226; SSE-LABEL: combine_bitwise_ops_test4: 227; SSE: # %bb.0: 228; SSE-NEXT: pand %xmm1, %xmm0 229; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 230; SSE-NEXT: retq 231; 232; AVX-LABEL: combine_bitwise_ops_test4: 233; AVX: # %bb.0: 234; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 235; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 236; AVX-NEXT: retq 237 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 
x i32><i32 4, i32 6, i32 5, i32 7> 238 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 239 %and = and <4 x i32> %shuf1, %shuf2 240 ret <4 x i32> %and 241} 242 243define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 244; SSE-LABEL: combine_bitwise_ops_test5: 245; SSE: # %bb.0: 246; SSE-NEXT: por %xmm1, %xmm0 247; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 248; SSE-NEXT: retq 249; 250; AVX-LABEL: combine_bitwise_ops_test5: 251; AVX: # %bb.0: 252; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 253; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 254; AVX-NEXT: retq 255 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 256 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 257 %or = or <4 x i32> %shuf1, %shuf2 258 ret <4 x i32> %or 259} 260 261define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 262; SSE-LABEL: combine_bitwise_ops_test6: 263; SSE: # %bb.0: 264; SSE-NEXT: pxor %xmm1, %xmm0 265; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 266; SSE-NEXT: retq 267; 268; AVX-LABEL: combine_bitwise_ops_test6: 269; AVX: # %bb.0: 270; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 271; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 272; AVX-NEXT: retq 273 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 274 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 275 %xor = xor <4 x i32> %shuf1, %shuf2 276 ret <4 x i32> %xor 277} 278 279 280; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles 281; are not performing a swizzle operation. 
282 283define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 284; SSE2-LABEL: combine_bitwise_ops_test1b: 285; SSE2: # %bb.0: 286; SSE2-NEXT: pand %xmm1, %xmm0 287; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 288; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 289; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 290; SSE2-NEXT: retq 291; 292; SSSE3-LABEL: combine_bitwise_ops_test1b: 293; SSSE3: # %bb.0: 294; SSSE3-NEXT: pand %xmm1, %xmm0 295; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 296; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 297; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 298; SSSE3-NEXT: retq 299; 300; SSE41-LABEL: combine_bitwise_ops_test1b: 301; SSE41: # %bb.0: 302; SSE41-NEXT: andps %xmm1, %xmm0 303; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 304; SSE41-NEXT: retq 305; 306; AVX-LABEL: combine_bitwise_ops_test1b: 307; AVX: # %bb.0: 308; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 309; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 310; AVX-NEXT: retq 311 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 312 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 313 %and = and <4 x i32> %shuf1, %shuf2 314 ret <4 x i32> %and 315} 316 317define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 318; SSE2-LABEL: combine_bitwise_ops_test2b: 319; SSE2: # %bb.0: 320; SSE2-NEXT: por %xmm1, %xmm0 321; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 322; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 323; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 324; SSE2-NEXT: retq 325; 326; SSSE3-LABEL: combine_bitwise_ops_test2b: 327; SSSE3: # %bb.0: 328; SSSE3-NEXT: por %xmm1, %xmm0 329; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 330; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 331; SSSE3-NEXT: 
punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 332; SSSE3-NEXT: retq 333; 334; SSE41-LABEL: combine_bitwise_ops_test2b: 335; SSE41: # %bb.0: 336; SSE41-NEXT: orps %xmm1, %xmm0 337; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 338; SSE41-NEXT: retq 339; 340; AVX-LABEL: combine_bitwise_ops_test2b: 341; AVX: # %bb.0: 342; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 343; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 344; AVX-NEXT: retq 345 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 346 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 347 %or = or <4 x i32> %shuf1, %shuf2 348 ret <4 x i32> %or 349} 350 351define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 352; SSE2-LABEL: combine_bitwise_ops_test3b: 353; SSE2: # %bb.0: 354; SSE2-NEXT: xorps %xmm1, %xmm0 355; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 356; SSE2-NEXT: retq 357; 358; SSSE3-LABEL: combine_bitwise_ops_test3b: 359; SSSE3: # %bb.0: 360; SSSE3-NEXT: xorps %xmm1, %xmm0 361; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 362; SSSE3-NEXT: retq 363; 364; SSE41-LABEL: combine_bitwise_ops_test3b: 365; SSE41: # %bb.0: 366; SSE41-NEXT: xorps %xmm1, %xmm0 367; SSE41-NEXT: xorps %xmm1, %xmm1 368; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 369; SSE41-NEXT: retq 370; 371; AVX-LABEL: combine_bitwise_ops_test3b: 372; AVX: # %bb.0: 373; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 374; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 375; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 376; AVX-NEXT: retq 377 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 378 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 379 %xor = xor <4 x i32> %shuf1, %shuf2 380 ret <4 x i32> %xor 381} 382 383define <4 x i32> @combine_bitwise_ops_test4b(<4 x 
i32> %a, <4 x i32> %b, <4 x i32> %c) { 384; SSE2-LABEL: combine_bitwise_ops_test4b: 385; SSE2: # %bb.0: 386; SSE2-NEXT: pand %xmm1, %xmm0 387; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 388; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 389; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 390; SSE2-NEXT: retq 391; 392; SSSE3-LABEL: combine_bitwise_ops_test4b: 393; SSSE3: # %bb.0: 394; SSSE3-NEXT: pand %xmm1, %xmm0 395; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 396; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 397; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 398; SSSE3-NEXT: retq 399; 400; SSE41-LABEL: combine_bitwise_ops_test4b: 401; SSE41: # %bb.0: 402; SSE41-NEXT: andps %xmm1, %xmm0 403; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 404; SSE41-NEXT: retq 405; 406; AVX-LABEL: combine_bitwise_ops_test4b: 407; AVX: # %bb.0: 408; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 409; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 410; AVX-NEXT: retq 411 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 412 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 413 %and = and <4 x i32> %shuf1, %shuf2 414 ret <4 x i32> %and 415} 416 417define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 418; SSE2-LABEL: combine_bitwise_ops_test5b: 419; SSE2: # %bb.0: 420; SSE2-NEXT: por %xmm1, %xmm0 421; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 422; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 423; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 424; SSE2-NEXT: retq 425; 426; SSSE3-LABEL: combine_bitwise_ops_test5b: 427; SSSE3: # %bb.0: 428; SSSE3-NEXT: por %xmm1, %xmm0 429; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 430; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 431; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 432; 
SSSE3-NEXT: retq 433; 434; SSE41-LABEL: combine_bitwise_ops_test5b: 435; SSE41: # %bb.0: 436; SSE41-NEXT: orps %xmm1, %xmm0 437; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 438; SSE41-NEXT: retq 439; 440; AVX-LABEL: combine_bitwise_ops_test5b: 441; AVX: # %bb.0: 442; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 443; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 444; AVX-NEXT: retq 445 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 446 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 447 %or = or <4 x i32> %shuf1, %shuf2 448 ret <4 x i32> %or 449} 450 451define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 452; SSE2-LABEL: combine_bitwise_ops_test6b: 453; SSE2: # %bb.0: 454; SSE2-NEXT: xorps %xmm1, %xmm0 455; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 456; SSE2-NEXT: retq 457; 458; SSSE3-LABEL: combine_bitwise_ops_test6b: 459; SSSE3: # %bb.0: 460; SSSE3-NEXT: xorps %xmm1, %xmm0 461; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 462; SSSE3-NEXT: retq 463; 464; SSE41-LABEL: combine_bitwise_ops_test6b: 465; SSE41: # %bb.0: 466; SSE41-NEXT: xorps %xmm1, %xmm0 467; SSE41-NEXT: xorps %xmm1, %xmm1 468; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 469; SSE41-NEXT: retq 470; 471; AVX-LABEL: combine_bitwise_ops_test6b: 472; AVX: # %bb.0: 473; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 474; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 475; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 476; AVX-NEXT: retq 477 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 478 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 479 %xor = xor <4 x i32> %shuf1, %shuf2 480 ret <4 x i32> %xor 481} 482 483define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 484; SSE-LABEL: 
combine_bitwise_ops_test1c: 485; SSE: # %bb.0: 486; SSE-NEXT: andps %xmm1, %xmm0 487; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 488; SSE-NEXT: retq 489; 490; AVX-LABEL: combine_bitwise_ops_test1c: 491; AVX: # %bb.0: 492; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 493; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 494; AVX-NEXT: retq 495 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 496 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 497 %and = and <4 x i32> %shuf1, %shuf2 498 ret <4 x i32> %and 499} 500 501define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 502; SSE-LABEL: combine_bitwise_ops_test2c: 503; SSE: # %bb.0: 504; SSE-NEXT: orps %xmm1, %xmm0 505; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 506; SSE-NEXT: retq 507; 508; AVX-LABEL: combine_bitwise_ops_test2c: 509; AVX: # %bb.0: 510; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 511; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 512; AVX-NEXT: retq 513 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 514 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 515 %or = or <4 x i32> %shuf1, %shuf2 516 ret <4 x i32> %or 517} 518 519define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 520; SSE2-LABEL: combine_bitwise_ops_test3c: 521; SSE2: # %bb.0: 522; SSE2-NEXT: xorps %xmm1, %xmm0 523; SSE2-NEXT: xorps %xmm1, %xmm1 524; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 525; SSE2-NEXT: retq 526; 527; SSSE3-LABEL: combine_bitwise_ops_test3c: 528; SSSE3: # %bb.0: 529; SSSE3-NEXT: xorps %xmm1, %xmm0 530; SSSE3-NEXT: xorps %xmm1, %xmm1 531; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 532; SSSE3-NEXT: retq 533; 534; SSE41-LABEL: combine_bitwise_ops_test3c: 535; SSE41: # %bb.0: 536; SSE41-NEXT: xorps %xmm1, %xmm0 537; SSE41-NEXT: insertps {{.*#+}} xmm0 = 
xmm0[0,2],zero,zero 538; SSE41-NEXT: retq 539; 540; AVX-LABEL: combine_bitwise_ops_test3c: 541; AVX: # %bb.0: 542; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 543; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero 544; AVX-NEXT: retq 545 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 546 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 547 %xor = xor <4 x i32> %shuf1, %shuf2 548 ret <4 x i32> %xor 549} 550 551define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 552; SSE-LABEL: combine_bitwise_ops_test4c: 553; SSE: # %bb.0: 554; SSE-NEXT: andps %xmm1, %xmm0 555; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] 556; SSE-NEXT: movaps %xmm2, %xmm0 557; SSE-NEXT: retq 558; 559; AVX-LABEL: combine_bitwise_ops_test4c: 560; AVX: # %bb.0: 561; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 562; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] 563; AVX-NEXT: retq 564 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 565 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 566 %and = and <4 x i32> %shuf1, %shuf2 567 ret <4 x i32> %and 568} 569 570define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 571; SSE-LABEL: combine_bitwise_ops_test5c: 572; SSE: # %bb.0: 573; SSE-NEXT: orps %xmm1, %xmm0 574; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] 575; SSE-NEXT: movaps %xmm2, %xmm0 576; SSE-NEXT: retq 577; 578; AVX-LABEL: combine_bitwise_ops_test5c: 579; AVX: # %bb.0: 580; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 581; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] 582; AVX-NEXT: retq 583 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 584 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 585 %or = or <4 x i32> %shuf1, %shuf2 586 ret <4 x i32> %or 587} 588 589define <4 x 
i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 590; SSE2-LABEL: combine_bitwise_ops_test6c: 591; SSE2: # %bb.0: 592; SSE2-NEXT: xorps %xmm1, %xmm0 593; SSE2-NEXT: xorps %xmm1, %xmm1 594; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] 595; SSE2-NEXT: movaps %xmm1, %xmm0 596; SSE2-NEXT: retq 597; 598; SSSE3-LABEL: combine_bitwise_ops_test6c: 599; SSSE3: # %bb.0: 600; SSSE3-NEXT: xorps %xmm1, %xmm0 601; SSSE3-NEXT: xorps %xmm1, %xmm1 602; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] 603; SSSE3-NEXT: movaps %xmm1, %xmm0 604; SSSE3-NEXT: retq 605; 606; SSE41-LABEL: combine_bitwise_ops_test6c: 607; SSE41: # %bb.0: 608; SSE41-NEXT: xorps %xmm1, %xmm0 609; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3] 610; SSE41-NEXT: retq 611; 612; AVX-LABEL: combine_bitwise_ops_test6c: 613; AVX: # %bb.0: 614; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 615; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3] 616; AVX-NEXT: retq 617 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 618 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 619 %xor = xor <4 x i32> %shuf1, %shuf2 620 ret <4 x i32> %xor 621} 622 623define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { 624; SSE-LABEL: combine_nested_undef_test1: 625; SSE: # %bb.0: 626; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 627; SSE-NEXT: retq 628; 629; AVX-LABEL: combine_nested_undef_test1: 630; AVX: # %bb.0: 631; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] 632; AVX-NEXT: retq 633 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 634 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 635 ret <4 x i32> %2 636} 637 638define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { 639; SSE-LABEL: combine_nested_undef_test2: 640; SSE: # %bb.0: 641; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 642; 
SSE-NEXT: retq 643; 644; AVX-LABEL: combine_nested_undef_test2: 645; AVX: # %bb.0: 646; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] 647; AVX-NEXT: retq 648 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 649 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 650 ret <4 x i32> %2 651} 652 653define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { 654; SSE-LABEL: combine_nested_undef_test3: 655; SSE: # %bb.0: 656; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 657; SSE-NEXT: retq 658; 659; AVX-LABEL: combine_nested_undef_test3: 660; AVX: # %bb.0: 661; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] 662; AVX-NEXT: retq 663 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> 664 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 665 ret <4 x i32> %2 666} 667 668define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { 669; SSE-LABEL: combine_nested_undef_test4: 670; SSE: # %bb.0: 671; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 672; SSE-NEXT: retq 673; 674; AVX1-LABEL: combine_nested_undef_test4: 675; AVX1: # %bb.0: 676; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 677; AVX1-NEXT: retq 678; 679; AVX2-LABEL: combine_nested_undef_test4: 680; AVX2: # %bb.0: 681; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 682; AVX2-NEXT: retq 683 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> 684 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> 685 ret <4 x i32> %2 686} 687 688define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { 689; SSE-LABEL: combine_nested_undef_test5: 690; SSE: # %bb.0: 691; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 692; SSE-NEXT: retq 693; 694; AVX-LABEL: combine_nested_undef_test5: 695; AVX: # %bb.0: 696; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] 697; 
AVX-NEXT: retq 698 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> 699 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> 700 ret <4 x i32> %2 701} 702 703define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) { 704; SSE-LABEL: combine_nested_undef_test6: 705; SSE: # %bb.0: 706; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 707; SSE-NEXT: retq 708; 709; AVX-LABEL: combine_nested_undef_test6: 710; AVX: # %bb.0: 711; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] 712; AVX-NEXT: retq 713 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 714 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> 715 ret <4 x i32> %2 716} 717 718define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) { 719; SSE-LABEL: combine_nested_undef_test7: 720; SSE: # %bb.0: 721; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] 722; SSE-NEXT: retq 723; 724; AVX-LABEL: combine_nested_undef_test7: 725; AVX: # %bb.0: 726; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] 727; AVX-NEXT: retq 728 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 729 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 730 ret <4 x i32> %2 731} 732 733define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { 734; SSE-LABEL: combine_nested_undef_test8: 735; SSE: # %bb.0: 736; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 737; SSE-NEXT: retq 738; 739; AVX-LABEL: combine_nested_undef_test8: 740; AVX: # %bb.0: 741; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] 742; AVX-NEXT: retq 743 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 744 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> 745 ret <4 x i32> %2 746} 747 748define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 
x i32> %B) { 749; SSE-LABEL: combine_nested_undef_test9: 750; SSE: # %bb.0: 751; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] 752; SSE-NEXT: retq 753; 754; AVX-LABEL: combine_nested_undef_test9: 755; AVX: # %bb.0: 756; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2] 757; AVX-NEXT: retq 758 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> 759 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> 760 ret <4 x i32> %2 761} 762 763define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) { 764; SSE-LABEL: combine_nested_undef_test10: 765; SSE: # %bb.0: 766; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 767; SSE-NEXT: retq 768; 769; AVX-LABEL: combine_nested_undef_test10: 770; AVX: # %bb.0: 771; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] 772; AVX-NEXT: retq 773 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5> 774 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4> 775 ret <4 x i32> %2 776} 777 778define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) { 779; SSE-LABEL: combine_nested_undef_test11: 780; SSE: # %bb.0: 781; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] 782; SSE-NEXT: retq 783; 784; AVX-LABEL: combine_nested_undef_test11: 785; AVX: # %bb.0: 786; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1] 787; AVX-NEXT: retq 788 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4> 789 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0> 790 ret <4 x i32> %2 791} 792 793define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) { 794; SSE-LABEL: combine_nested_undef_test12: 795; SSE: # %bb.0: 796; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 797; SSE-NEXT: retq 798; 799; AVX1-LABEL: combine_nested_undef_test12: 800; AVX1: # %bb.0: 801; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 802; 
AVX1-NEXT: retq 803; 804; AVX2-LABEL: combine_nested_undef_test12: 805; AVX2: # %bb.0: 806; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 807; AVX2-NEXT: retq 808 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> 809 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> 810 ret <4 x i32> %2 811} 812 813; The following pair of shuffles is folded into vector %A. 814define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) { 815; CHECK-LABEL: combine_nested_undef_test13: 816; CHECK: # %bb.0: 817; CHECK-NEXT: retq 818 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6> 819 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4> 820 ret <4 x i32> %2 821} 822 823; The following pair of shuffles is folded into vector %B. 824define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) { 825; SSE-LABEL: combine_nested_undef_test14: 826; SSE: # %bb.0: 827; SSE-NEXT: movaps %xmm1, %xmm0 828; SSE-NEXT: retq 829; 830; AVX-LABEL: combine_nested_undef_test14: 831; AVX: # %bb.0: 832; AVX-NEXT: vmovaps %xmm1, %xmm0 833; AVX-NEXT: retq 834 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 835 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4> 836 ret <4 x i32> %2 837} 838 839 840; Verify that we don't optimize the following cases. We expect more than one shuffle. 841; 842; FIXME: Many of these already don't make sense, and the rest should stop 843; making sense with the new vector shuffle lowering. Revisit at least testing for 844; it. 
845 846define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) { 847; SSE2-LABEL: combine_nested_undef_test15: 848; SSE2: # %bb.0: 849; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 850; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 851; SSE2-NEXT: movaps %xmm1, %xmm0 852; SSE2-NEXT: retq 853; 854; SSSE3-LABEL: combine_nested_undef_test15: 855; SSSE3: # %bb.0: 856; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 857; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 858; SSSE3-NEXT: movaps %xmm1, %xmm0 859; SSSE3-NEXT: retq 860; 861; SSE41-LABEL: combine_nested_undef_test15: 862; SSE41: # %bb.0: 863; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 864; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 865; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 866; SSE41-NEXT: retq 867; 868; AVX1-LABEL: combine_nested_undef_test15: 869; AVX1: # %bb.0: 870; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] 871; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] 872; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 873; AVX1-NEXT: retq 874; 875; AVX2-LABEL: combine_nested_undef_test15: 876; AVX2: # %bb.0: 877; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 878; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] 879; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 880; AVX2-NEXT: retq 881 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 882 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 883 ret <4 x i32> %2 884} 885 886define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) { 887; SSE2-LABEL: combine_nested_undef_test16: 888; SSE2: # %bb.0: 889; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 890; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 891; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 892; SSE2-NEXT: retq 893; 894; SSSE3-LABEL: combine_nested_undef_test16: 895; 
SSSE3: # %bb.0: 896; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 897; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 898; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 899; SSSE3-NEXT: retq 900; 901; SSE41-LABEL: combine_nested_undef_test16: 902; SSE41: # %bb.0: 903; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 904; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 905; SSE41-NEXT: retq 906; 907; AVX-LABEL: combine_nested_undef_test16: 908; AVX: # %bb.0: 909; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] 910; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 911; AVX-NEXT: retq 912 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 913 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 914 ret <4 x i32> %2 915} 916 917define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) { 918; SSE2-LABEL: combine_nested_undef_test17: 919; SSE2: # %bb.0: 920; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 921; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 922; SSE2-NEXT: retq 923; 924; SSSE3-LABEL: combine_nested_undef_test17: 925; SSSE3: # %bb.0: 926; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 927; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 928; SSSE3-NEXT: retq 929; 930; SSE41-LABEL: combine_nested_undef_test17: 931; SSE41: # %bb.0: 932; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 933; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 934; SSE41-NEXT: retq 935; 936; AVX-LABEL: combine_nested_undef_test17: 937; AVX: # %bb.0: 938; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 939; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] 940; AVX-NEXT: retq 941 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 942 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 943 ret <4 x 
i32> %2 944} 945 946define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) { 947; SSE-LABEL: combine_nested_undef_test18: 948; SSE: # %bb.0: 949; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] 950; SSE-NEXT: retq 951; 952; AVX-LABEL: combine_nested_undef_test18: 953; AVX: # %bb.0: 954; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3] 955; AVX-NEXT: retq 956 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 957 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> 958 ret <4 x i32> %2 959} 960 961define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) { 962; SSE2-LABEL: combine_nested_undef_test19: 963; SSE2: # %bb.0: 964; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 965; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 966; SSE2-NEXT: retq 967; 968; SSSE3-LABEL: combine_nested_undef_test19: 969; SSSE3: # %bb.0: 970; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 971; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 972; SSSE3-NEXT: retq 973; 974; SSE41-LABEL: combine_nested_undef_test19: 975; SSE41: # %bb.0: 976; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 977; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 978; SSE41-NEXT: retq 979; 980; AVX-LABEL: combine_nested_undef_test19: 981; AVX: # %bb.0: 982; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 983; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] 984; AVX-NEXT: retq 985 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 986 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> 987 ret <4 x i32> %2 988} 989 990define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { 991; SSE2-LABEL: combine_nested_undef_test20: 992; SSE2: # %bb.0: 993; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 994; SSE2-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,2,3,1] 995; SSE2-NEXT: movaps %xmm1, %xmm0 996; SSE2-NEXT: retq 997; 998; SSSE3-LABEL: combine_nested_undef_test20: 999; SSSE3: # %bb.0: 1000; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1001; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1002; SSSE3-NEXT: movaps %xmm1, %xmm0 1003; SSSE3-NEXT: retq 1004; 1005; SSE41-LABEL: combine_nested_undef_test20: 1006; SSE41: # %bb.0: 1007; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1008; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1009; SSE41-NEXT: retq 1010; 1011; AVX-LABEL: combine_nested_undef_test20: 1012; AVX: # %bb.0: 1013; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1014; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0] 1015; AVX-NEXT: retq 1016 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> 1017 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1018 ret <4 x i32> %2 1019} 1020 1021define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { 1022; SSE2-LABEL: combine_nested_undef_test21: 1023; SSE2: # %bb.0: 1024; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1025; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1026; SSE2-NEXT: retq 1027; 1028; SSSE3-LABEL: combine_nested_undef_test21: 1029; SSSE3: # %bb.0: 1030; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1031; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1032; SSSE3-NEXT: retq 1033; 1034; SSE41-LABEL: combine_nested_undef_test21: 1035; SSE41: # %bb.0: 1036; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1037; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1038; SSE41-NEXT: retq 1039; 1040; AVX1-LABEL: combine_nested_undef_test21: 1041; AVX1: # %bb.0: 1042; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1043; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1044; AVX1-NEXT: retq 1045; 1046; AVX2-LABEL: 
combine_nested_undef_test21: 1047; AVX2: # %bb.0: 1048; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1049; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1050; AVX2-NEXT: retq 1051 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 1052 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1053 ret <4 x i32> %2 1054} 1055 1056 1057; Test that we correctly combine shuffles according to rule 1058; shuffle(shuffle(x, y), undef) -> shuffle(y, undef) 1059 1060define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) { 1061; SSE-LABEL: combine_nested_undef_test22: 1062; SSE: # %bb.0: 1063; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1064; SSE-NEXT: retq 1065; 1066; AVX-LABEL: combine_nested_undef_test22: 1067; AVX: # %bb.0: 1068; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3] 1069; AVX-NEXT: retq 1070 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1071 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> 1072 ret <4 x i32> %2 1073} 1074 1075define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) { 1076; SSE-LABEL: combine_nested_undef_test23: 1077; SSE: # %bb.0: 1078; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1079; SSE-NEXT: retq 1080; 1081; AVX-LABEL: combine_nested_undef_test23: 1082; AVX: # %bb.0: 1083; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3] 1084; AVX-NEXT: retq 1085 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1086 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1087 ret <4 x i32> %2 1088} 1089 1090define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { 1091; SSE-LABEL: combine_nested_undef_test24: 1092; SSE: # %bb.0: 1093; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1094; SSE-NEXT: retq 1095; 1096; AVX-LABEL: combine_nested_undef_test24: 1097; AVX: # 
%bb.0: 1098; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3] 1099; AVX-NEXT: retq 1100 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1101 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4> 1102 ret <4 x i32> %2 1103} 1104 1105define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { 1106; SSE-LABEL: combine_nested_undef_test25: 1107; SSE: # %bb.0: 1108; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1109; SSE-NEXT: retq 1110; 1111; AVX1-LABEL: combine_nested_undef_test25: 1112; AVX1: # %bb.0: 1113; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1114; AVX1-NEXT: retq 1115; 1116; AVX2-LABEL: combine_nested_undef_test25: 1117; AVX2: # %bb.0: 1118; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1119; AVX2-NEXT: retq 1120 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4> 1121 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1> 1122 ret <4 x i32> %2 1123} 1124 1125define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { 1126; SSE-LABEL: combine_nested_undef_test26: 1127; SSE: # %bb.0: 1128; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1129; SSE-NEXT: retq 1130; 1131; AVX-LABEL: combine_nested_undef_test26: 1132; AVX: # %bb.0: 1133; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] 1134; AVX-NEXT: retq 1135 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7> 1136 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 1137 ret <4 x i32> %2 1138} 1139 1140define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) { 1141; SSE-LABEL: combine_nested_undef_test27: 1142; SSE: # %bb.0: 1143; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1144; SSE-NEXT: retq 1145; 1146; AVX1-LABEL: combine_nested_undef_test27: 1147; AVX1: # %bb.0: 1148; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 1149; AVX1-NEXT: 
retq 1150; 1151; AVX2-LABEL: combine_nested_undef_test27: 1152; AVX2: # %bb.0: 1153; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 1154; AVX2-NEXT: retq 1155 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4> 1156 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2> 1157 ret <4 x i32> %2 1158} 1159 1160define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { 1161; SSE-LABEL: combine_nested_undef_test28: 1162; SSE: # %bb.0: 1163; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1164; SSE-NEXT: retq 1165; 1166; AVX-LABEL: combine_nested_undef_test28: 1167; AVX: # %bb.0: 1168; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0] 1169; AVX-NEXT: retq 1170 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 1171 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> 1172 ret <4 x i32> %2 1173} 1174 1175define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { 1176; SSE-LABEL: combine_test1: 1177; SSE: # %bb.0: 1178; SSE-NEXT: movaps %xmm1, %xmm0 1179; SSE-NEXT: retq 1180; 1181; AVX-LABEL: combine_test1: 1182; AVX: # %bb.0: 1183; AVX-NEXT: vmovaps %xmm1, %xmm0 1184; AVX-NEXT: retq 1185 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1186 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1187 ret <4 x float> %2 1188} 1189 1190define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { 1191; SSE2-LABEL: combine_test2: 1192; SSE2: # %bb.0: 1193; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1194; SSE2-NEXT: movaps %xmm1, %xmm0 1195; SSE2-NEXT: retq 1196; 1197; SSSE3-LABEL: combine_test2: 1198; SSSE3: # %bb.0: 1199; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1200; SSSE3-NEXT: movaps %xmm1, %xmm0 1201; SSSE3-NEXT: retq 1202; 1203; SSE41-LABEL: combine_test2: 1204; SSE41: # %bb.0: 1205; SSE41-NEXT: blendps 
{{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1206; SSE41-NEXT: retq 1207; 1208; AVX-LABEL: combine_test2: 1209; AVX: # %bb.0: 1210; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1211; AVX-NEXT: retq 1212 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1213 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1214 ret <4 x float> %2 1215} 1216 1217define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { 1218; SSE-LABEL: combine_test3: 1219; SSE: # %bb.0: 1220; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1221; SSE-NEXT: retq 1222; 1223; AVX-LABEL: combine_test3: 1224; AVX: # %bb.0: 1225; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1226; AVX-NEXT: retq 1227 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1228 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1229 ret <4 x float> %2 1230} 1231 1232define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { 1233; SSE-LABEL: combine_test4: 1234; SSE: # %bb.0: 1235; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1236; SSE-NEXT: retq 1237; 1238; AVX-LABEL: combine_test4: 1239; AVX: # %bb.0: 1240; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1241; AVX-NEXT: retq 1242 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1243 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1244 ret <4 x float> %2 1245} 1246 1247define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { 1248; SSE2-LABEL: combine_test5: 1249; SSE2: # %bb.0: 1250; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1251; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1252; SSE2-NEXT: retq 1253; 1254; SSSE3-LABEL: combine_test5: 1255; SSSE3: # %bb.0: 1256; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1257; SSSE3-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[2,0],xmm1[2,3] 1258; SSSE3-NEXT: retq 1259; 1260; SSE41-LABEL: combine_test5: 1261; SSE41: # %bb.0: 1262; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1263; SSE41-NEXT: retq 1264; 1265; AVX-LABEL: combine_test5: 1266; AVX: # %bb.0: 1267; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1268; AVX-NEXT: retq 1269 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1270 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1271 ret <4 x float> %2 1272} 1273 1274define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { 1275; SSE-LABEL: combine_test6: 1276; SSE: # %bb.0: 1277; SSE-NEXT: movaps %xmm1, %xmm0 1278; SSE-NEXT: retq 1279; 1280; AVX-LABEL: combine_test6: 1281; AVX: # %bb.0: 1282; AVX-NEXT: vmovaps %xmm1, %xmm0 1283; AVX-NEXT: retq 1284 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1285 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1286 ret <4 x i32> %2 1287} 1288 1289define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { 1290; SSE2-LABEL: combine_test7: 1291; SSE2: # %bb.0: 1292; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1293; SSE2-NEXT: movaps %xmm1, %xmm0 1294; SSE2-NEXT: retq 1295; 1296; SSSE3-LABEL: combine_test7: 1297; SSSE3: # %bb.0: 1298; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1299; SSSE3-NEXT: movaps %xmm1, %xmm0 1300; SSSE3-NEXT: retq 1301; 1302; SSE41-LABEL: combine_test7: 1303; SSE41: # %bb.0: 1304; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1305; SSE41-NEXT: retq 1306; 1307; AVX-LABEL: combine_test7: 1308; AVX: # %bb.0: 1309; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1310; AVX-NEXT: retq 1311 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1312 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1313 ret <4 x i32> %2 1314} 1315 
1316define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { 1317; SSE-LABEL: combine_test8: 1318; SSE: # %bb.0: 1319; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1320; SSE-NEXT: retq 1321; 1322; AVX-LABEL: combine_test8: 1323; AVX: # %bb.0: 1324; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1325; AVX-NEXT: retq 1326 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1327 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1328 ret <4 x i32> %2 1329} 1330 1331define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { 1332; SSE-LABEL: combine_test9: 1333; SSE: # %bb.0: 1334; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1335; SSE-NEXT: movaps %xmm1, %xmm0 1336; SSE-NEXT: retq 1337; 1338; AVX-LABEL: combine_test9: 1339; AVX: # %bb.0: 1340; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1341; AVX-NEXT: retq 1342 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1343 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1344 ret <4 x i32> %2 1345} 1346 1347define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { 1348; SSE2-LABEL: combine_test10: 1349; SSE2: # %bb.0: 1350; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1351; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1352; SSE2-NEXT: retq 1353; 1354; SSSE3-LABEL: combine_test10: 1355; SSSE3: # %bb.0: 1356; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1357; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1358; SSSE3-NEXT: retq 1359; 1360; SSE41-LABEL: combine_test10: 1361; SSE41: # %bb.0: 1362; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1363; SSE41-NEXT: retq 1364; 1365; AVX-LABEL: combine_test10: 1366; AVX: # %bb.0: 1367; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1368; AVX-NEXT: retq 1369 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1370 
%2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1371 ret <4 x i32> %2 1372} 1373 1374define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { 1375; CHECK-LABEL: combine_test11: 1376; CHECK: # %bb.0: 1377; CHECK-NEXT: retq 1378 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1379 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1380 ret <4 x float> %2 1381} 1382 1383define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { 1384; SSE2-LABEL: combine_test12: 1385; SSE2: # %bb.0: 1386; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1387; SSE2-NEXT: movaps %xmm1, %xmm0 1388; SSE2-NEXT: retq 1389; 1390; SSSE3-LABEL: combine_test12: 1391; SSSE3: # %bb.0: 1392; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1393; SSSE3-NEXT: movaps %xmm1, %xmm0 1394; SSSE3-NEXT: retq 1395; 1396; SSE41-LABEL: combine_test12: 1397; SSE41: # %bb.0: 1398; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1399; SSE41-NEXT: retq 1400; 1401; AVX-LABEL: combine_test12: 1402; AVX: # %bb.0: 1403; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1404; AVX-NEXT: retq 1405 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1406 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1407 ret <4 x float> %2 1408} 1409 1410define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { 1411; SSE-LABEL: combine_test13: 1412; SSE: # %bb.0: 1413; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1414; SSE-NEXT: retq 1415; 1416; AVX-LABEL: combine_test13: 1417; AVX: # %bb.0: 1418; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1419; AVX-NEXT: retq 1420 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1421 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1422 ret <4 x float> %2 
1423} 1424 1425define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { 1426; SSE-LABEL: combine_test14: 1427; SSE: # %bb.0: 1428; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1429; SSE-NEXT: retq 1430; 1431; AVX-LABEL: combine_test14: 1432; AVX: # %bb.0: 1433; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1434; AVX-NEXT: retq 1435 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1436 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1437 ret <4 x float> %2 1438} 1439 1440define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { 1441; SSE2-LABEL: combine_test15: 1442; SSE2: # %bb.0: 1443; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1444; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1445; SSE2-NEXT: retq 1446; 1447; SSSE3-LABEL: combine_test15: 1448; SSSE3: # %bb.0: 1449; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1450; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1451; SSSE3-NEXT: retq 1452; 1453; SSE41-LABEL: combine_test15: 1454; SSE41: # %bb.0: 1455; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1456; SSE41-NEXT: retq 1457; 1458; AVX-LABEL: combine_test15: 1459; AVX: # %bb.0: 1460; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1461; AVX-NEXT: retq 1462 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1463 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1464 ret <4 x float> %2 1465} 1466 1467define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { 1468; CHECK-LABEL: combine_test16: 1469; CHECK: # %bb.0: 1470; CHECK-NEXT: retq 1471 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1472 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1473 ret <4 x i32> %2 1474} 1475 1476define <4 x i32> @combine_test17(<4 x i32> %a, <4 x 
i32> %b) { 1477; SSE2-LABEL: combine_test17: 1478; SSE2: # %bb.0: 1479; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1480; SSE2-NEXT: movaps %xmm1, %xmm0 1481; SSE2-NEXT: retq 1482; 1483; SSSE3-LABEL: combine_test17: 1484; SSSE3: # %bb.0: 1485; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1486; SSSE3-NEXT: movaps %xmm1, %xmm0 1487; SSSE3-NEXT: retq 1488; 1489; SSE41-LABEL: combine_test17: 1490; SSE41: # %bb.0: 1491; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1492; SSE41-NEXT: retq 1493; 1494; AVX-LABEL: combine_test17: 1495; AVX: # %bb.0: 1496; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1497; AVX-NEXT: retq 1498 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1499 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1500 ret <4 x i32> %2 1501} 1502 1503define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { 1504; SSE-LABEL: combine_test18: 1505; SSE: # %bb.0: 1506; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1507; SSE-NEXT: retq 1508; 1509; AVX-LABEL: combine_test18: 1510; AVX: # %bb.0: 1511; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1512; AVX-NEXT: retq 1513 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1514 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1515 ret <4 x i32> %2 1516} 1517 1518define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { 1519; SSE-LABEL: combine_test19: 1520; SSE: # %bb.0: 1521; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1522; SSE-NEXT: retq 1523; 1524; AVX-LABEL: combine_test19: 1525; AVX: # %bb.0: 1526; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1527; AVX-NEXT: retq 1528 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1529 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1530 ret <4 x i32> %2 1531} 1532 1533define <4 x i32> 
@combine_test20(<4 x i32> %a, <4 x i32> %b) { 1534; SSE2-LABEL: combine_test20: 1535; SSE2: # %bb.0: 1536; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1537; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1538; SSE2-NEXT: retq 1539; 1540; SSSE3-LABEL: combine_test20: 1541; SSSE3: # %bb.0: 1542; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1543; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1544; SSSE3-NEXT: retq 1545; 1546; SSE41-LABEL: combine_test20: 1547; SSE41: # %bb.0: 1548; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1549; SSE41-NEXT: retq 1550; 1551; AVX-LABEL: combine_test20: 1552; AVX: # %bb.0: 1553; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1554; AVX-NEXT: retq 1555 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1556 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1557 ret <4 x i32> %2 1558} 1559 1560define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) { 1561; SSE-LABEL: combine_test21: 1562; SSE: # %bb.0: 1563; SSE-NEXT: movaps %xmm0, %xmm2 1564; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] 1565; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1566; SSE-NEXT: movaps %xmm2, (%rdi) 1567; SSE-NEXT: retq 1568; 1569; AVX-LABEL: combine_test21: 1570; AVX: # %bb.0: 1571; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 1572; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] 1573; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1574; AVX-NEXT: vmovaps %xmm2, (%rdi) 1575; AVX-NEXT: vzeroupper 1576; AVX-NEXT: retq 1577 %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1578 %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1579 store <4 x i32> %1, <4 x i32>* %ptr, align 16 1580 ret <4 x i32> %2 1581} 1582 1583define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) { 1584; SSE-LABEL: combine_test22: 1585; SSE: # %bb.0: 
1586; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1587; SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 1588; SSE-NEXT: retq 1589; 1590; AVX-LABEL: combine_test22: 1591; AVX: # %bb.0: 1592; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 1593; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] 1594; AVX-NEXT: retq 1595; Current AVX2 lowering of this is still awful, not adding a test case. 1596 %1 = load <2 x float>, <2 x float>* %a, align 8 1597 %2 = load <2 x float>, <2 x float>* %b, align 8 1598 %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1599 ret <8 x float> %3 1600} 1601 1602; PR22359 1603define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) { 1604; SSE-LABEL: combine_test23: 1605; SSE: # %bb.0: 1606; SSE-NEXT: movups %xmm0, (%rdi) 1607; SSE-NEXT: retq 1608; 1609; AVX-LABEL: combine_test23: 1610; AVX: # %bb.0: 1611; AVX-NEXT: vmovups %xmm0, (%rdi) 1612; AVX-NEXT: vzeroupper 1613; AVX-NEXT: retq 1614 %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1 1615 %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1> 1616 %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3> 1617 store <2 x float> %shuffle0, <2 x float>* %ptr, align 8 1618 store <2 x float> %shuffle1, <2 x float>* %idx2, align 8 1619 ret void 1620} 1621 1622; Check some negative cases. 1623; FIXME: Do any of these really make sense? Are they redundant with the above tests? 
1624 1625define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { 1626; SSE-LABEL: combine_test1b: 1627; SSE: # %bb.0: 1628; SSE-NEXT: movaps %xmm1, %xmm0 1629; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 1630; SSE-NEXT: retq 1631; 1632; AVX-LABEL: combine_test1b: 1633; AVX: # %bb.0: 1634; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] 1635; AVX-NEXT: retq 1636 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1637 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> 1638 ret <4 x float> %2 1639} 1640 1641define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { 1642; SSE2-LABEL: combine_test2b: 1643; SSE2: # %bb.0: 1644; SSE2-NEXT: movaps %xmm1, %xmm0 1645; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1646; SSE2-NEXT: retq 1647; 1648; SSSE3-LABEL: combine_test2b: 1649; SSSE3: # %bb.0: 1650; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1651; SSSE3-NEXT: retq 1652; 1653; SSE41-LABEL: combine_test2b: 1654; SSE41: # %bb.0: 1655; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1656; SSE41-NEXT: retq 1657; 1658; AVX-LABEL: combine_test2b: 1659; AVX: # %bb.0: 1660; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] 1661; AVX-NEXT: retq 1662 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1663 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> 1664 ret <4 x float> %2 1665} 1666 1667define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { 1668; SSE2-LABEL: combine_test3b: 1669; SSE2: # %bb.0: 1670; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1671; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1672; SSE2-NEXT: retq 1673; 1674; SSSE3-LABEL: combine_test3b: 1675; SSSE3: # %bb.0: 1676; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1677; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1678; SSSE3-NEXT: retq 1679; 1680; SSE41-LABEL: 
combine_test3b: 1681; SSE41: # %bb.0: 1682; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1683; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1684; SSE41-NEXT: retq 1685; 1686; AVX-LABEL: combine_test3b: 1687; AVX: # %bb.0: 1688; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1689; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1690; AVX-NEXT: retq 1691 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> 1692 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> 1693 ret <4 x float> %2 1694} 1695 1696define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { 1697; SSE-LABEL: combine_test4b: 1698; SSE: # %bb.0: 1699; SSE-NEXT: movaps %xmm1, %xmm0 1700; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3] 1701; SSE-NEXT: retq 1702; 1703; AVX-LABEL: combine_test4b: 1704; AVX: # %bb.0: 1705; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] 1706; AVX-NEXT: retq 1707 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1708 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> 1709 ret <4 x float> %2 1710} 1711 1712 1713; Verify that we correctly fold shuffles even when we use illegal vector types. 
1714 1715define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { 1716; SSE2-LABEL: combine_test1c: 1717; SSE2: # %bb.0: 1718; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1719; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1720; SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1721; SSE2-NEXT: andps %xmm0, %xmm2 1722; SSE2-NEXT: andnps %xmm1, %xmm0 1723; SSE2-NEXT: orps %xmm2, %xmm0 1724; SSE2-NEXT: retq 1725; 1726; SSSE3-LABEL: combine_test1c: 1727; SSSE3: # %bb.0: 1728; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1729; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1730; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1731; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] 1732; SSSE3-NEXT: retq 1733; 1734; SSE41-LABEL: combine_test1c: 1735; SSE41: # %bb.0: 1736; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1737; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1738; SSE41-NEXT: movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> 1739; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1740; SSE41-NEXT: movdqa %xmm1, %xmm0 1741; SSE41-NEXT: retq 1742; 1743; AVX-LABEL: combine_test1c: 1744; AVX: # %bb.0: 1745; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1746; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1747; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> 1748; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1749; AVX-NEXT: retq 1750 %A = load <4 x i8>, <4 x i8>* %a 1751 %B = load <4 x i8>, <4 x i8>* %b 1752 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1753 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1754 ret <4 x i8> %2 1755} 1756 1757define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { 
1758; SSE-LABEL: combine_test2c: 1759; SSE: # %bb.0: 1760; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1761; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1762; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1763; SSE-NEXT: retq 1764; 1765; AVX-LABEL: combine_test2c: 1766; AVX: # %bb.0: 1767; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1768; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1769; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1770; AVX-NEXT: retq 1771 %A = load <4 x i8>, <4 x i8>* %a 1772 %B = load <4 x i8>, <4 x i8>* %b 1773 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> 1774 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1775 ret <4 x i8> %2 1776} 1777 1778define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { 1779; SSE-LABEL: combine_test3c: 1780; SSE: # %bb.0: 1781; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1782; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1783; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1784; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1785; SSE-NEXT: retq 1786; 1787; AVX-LABEL: combine_test3c: 1788; AVX: # %bb.0: 1789; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1790; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1791; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1792; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1793; AVX-NEXT: retq 1794 %A = load <4 x i8>, <4 x i8>* %a 1795 %B = load <4 x i8>, <4 x i8>* %b 1796 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1797 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1798 ret <4 x i8> %2 1799} 1800 1801define <4 x i8> @combine_test4c(<4 x 
i8>* %a, <4 x i8>* %b) { 1802; SSE2-LABEL: combine_test4c: 1803; SSE2: # %bb.0: 1804; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1805; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 1806; SSE2-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1807; SSE2-NEXT: andps %xmm0, %xmm2 1808; SSE2-NEXT: andnps %xmm1, %xmm0 1809; SSE2-NEXT: orps %xmm2, %xmm0 1810; SSE2-NEXT: retq 1811; 1812; SSSE3-LABEL: combine_test4c: 1813; SSSE3: # %bb.0: 1814; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1815; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1816; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 1817; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u] 1818; SSSE3-NEXT: retq 1819; 1820; SSE41-LABEL: combine_test4c: 1821; SSE41: # %bb.0: 1822; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1823; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 1824; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> 1825; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 1826; SSE41-NEXT: movdqa %xmm1, %xmm0 1827; SSE41-NEXT: retq 1828; 1829; AVX-LABEL: combine_test4c: 1830; AVX: # %bb.0: 1831; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1832; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1833; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> 1834; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 1835; AVX-NEXT: retq 1836 %A = load <4 x i8>, <4 x i8>* %a 1837 %B = load <4 x i8>, <4 x i8>* %b 1838 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1839 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1840 ret <4 x i8> %2 1841} 1842 1843 1844; The following test cases are generated from this C++ code 1845; 1846;__m128 blend_01(__m128 a, 
__m128 b) 1847;{ 1848; __m128 s = a; 1849; s = _mm_blend_ps( s, b, 1<<0 ); 1850; s = _mm_blend_ps( s, b, 1<<1 ); 1851; return s; 1852;} 1853; 1854;__m128 blend_02(__m128 a, __m128 b) 1855;{ 1856; __m128 s = a; 1857; s = _mm_blend_ps( s, b, 1<<0 ); 1858; s = _mm_blend_ps( s, b, 1<<2 ); 1859; return s; 1860;} 1861; 1862;__m128 blend_123(__m128 a, __m128 b) 1863;{ 1864; __m128 s = a; 1865; s = _mm_blend_ps( s, b, 1<<1 ); 1866; s = _mm_blend_ps( s, b, 1<<2 ); 1867; s = _mm_blend_ps( s, b, 1<<3 ); 1868; return s; 1869;} 1870 1871; Ideally, we should collapse the following shuffles into a single one. 1872 1873define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { 1874; SSE2-LABEL: combine_blend_01: 1875; SSE2: # %bb.0: 1876; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1877; SSE2-NEXT: retq 1878; 1879; SSSE3-LABEL: combine_blend_01: 1880; SSSE3: # %bb.0: 1881; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1882; SSSE3-NEXT: retq 1883; 1884; SSE41-LABEL: combine_blend_01: 1885; SSE41: # %bb.0: 1886; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1887; SSE41-NEXT: retq 1888; 1889; AVX-LABEL: combine_blend_01: 1890; AVX: # %bb.0: 1891; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1892; AVX-NEXT: retq 1893 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> 1894 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1895 ret <4 x float> %shuffle6 1896} 1897 1898define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { 1899; SSE2-LABEL: combine_blend_02: 1900; SSE2: # %bb.0: 1901; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 1902; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 1903; SSE2-NEXT: movaps %xmm1, %xmm0 1904; SSE2-NEXT: retq 1905; 1906; SSSE3-LABEL: combine_blend_02: 1907; SSSE3: # %bb.0: 1908; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 1909; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 1910; 
SSSE3-NEXT: movaps %xmm1, %xmm0 1911; SSSE3-NEXT: retq 1912; 1913; SSE41-LABEL: combine_blend_02: 1914; SSE41: # %bb.0: 1915; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1916; SSE41-NEXT: retq 1917; 1918; AVX-LABEL: combine_blend_02: 1919; AVX: # %bb.0: 1920; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 1921; AVX-NEXT: retq 1922 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> 1923 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1924 ret <4 x float> %shuffle6 1925} 1926 1927define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { 1928; SSE2-LABEL: combine_blend_123: 1929; SSE2: # %bb.0: 1930; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1931; SSE2-NEXT: movaps %xmm1, %xmm0 1932; SSE2-NEXT: retq 1933; 1934; SSSE3-LABEL: combine_blend_123: 1935; SSSE3: # %bb.0: 1936; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1937; SSSE3-NEXT: movaps %xmm1, %xmm0 1938; SSSE3-NEXT: retq 1939; 1940; SSE41-LABEL: combine_blend_123: 1941; SSE41: # %bb.0: 1942; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1943; SSE41-NEXT: retq 1944; 1945; AVX-LABEL: combine_blend_123: 1946; AVX: # %bb.0: 1947; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1948; AVX-NEXT: retq 1949 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 1950 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> 1951 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1952 ret <4 x float> %shuffle12 1953} 1954 1955define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { 1956; SSE-LABEL: combine_test_movhl_1: 1957; SSE: # %bb.0: 1958; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1959; SSE-NEXT: movaps %xmm1, %xmm0 1960; SSE-NEXT: retq 1961; 1962; 
AVX-LABEL: combine_test_movhl_1: 1963; AVX: # %bb.0: 1964; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1965; AVX-NEXT: retq 1966 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> 1967 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> 1968 ret <4 x i32> %2 1969} 1970 1971define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { 1972; SSE-LABEL: combine_test_movhl_2: 1973; SSE: # %bb.0: 1974; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1975; SSE-NEXT: movaps %xmm1, %xmm0 1976; SSE-NEXT: retq 1977; 1978; AVX-LABEL: combine_test_movhl_2: 1979; AVX: # %bb.0: 1980; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1981; AVX-NEXT: retq 1982 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> 1983 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> 1984 ret <4 x i32> %2 1985} 1986 1987define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { 1988; SSE-LABEL: combine_test_movhl_3: 1989; SSE: # %bb.0: 1990; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1991; SSE-NEXT: movaps %xmm1, %xmm0 1992; SSE-NEXT: retq 1993; 1994; AVX-LABEL: combine_test_movhl_3: 1995; AVX: # %bb.0: 1996; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1997; AVX-NEXT: retq 1998 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> 1999 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> 2000 ret <4 x i32> %2 2001} 2002 2003 2004; Verify that we fold shuffles according to rule: 2005; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) 2006 2007define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { 2008; SSE2-LABEL: combine_undef_input_test1: 2009; SSE2: # %bb.0: 2010; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2011; SSE2-NEXT: retq 2012; 2013; SSSE3-LABEL: combine_undef_input_test1: 2014; SSSE3: # %bb.0: 2015; 
SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2016; SSSE3-NEXT: retq 2017; 2018; SSE41-LABEL: combine_undef_input_test1: 2019; SSE41: # %bb.0: 2020; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2021; SSE41-NEXT: retq 2022; 2023; AVX-LABEL: combine_undef_input_test1: 2024; AVX: # %bb.0: 2025; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2026; AVX-NEXT: retq 2027 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2028 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2029 ret <4 x float> %2 2030} 2031 2032define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { 2033; SSE-LABEL: combine_undef_input_test2: 2034; SSE: # %bb.0: 2035; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2036; SSE-NEXT: retq 2037; 2038; AVX-LABEL: combine_undef_input_test2: 2039; AVX: # %bb.0: 2040; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2041; AVX-NEXT: retq 2042 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2043 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2044 ret <4 x float> %2 2045} 2046 2047define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { 2048; SSE-LABEL: combine_undef_input_test3: 2049; SSE: # %bb.0: 2050; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2051; SSE-NEXT: retq 2052; 2053; AVX-LABEL: combine_undef_input_test3: 2054; AVX: # %bb.0: 2055; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2056; AVX-NEXT: retq 2057 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2058 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2059 ret <4 x float> %2 2060} 2061 2062define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { 2063; SSE-LABEL: combine_undef_input_test4: 2064; SSE: # %bb.0: 2065; SSE-NEXT: movhlps {{.*#+}} xmm0 
= xmm1[1],xmm0[1] 2066; SSE-NEXT: retq 2067; 2068; AVX-LABEL: combine_undef_input_test4: 2069; AVX: # %bb.0: 2070; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2071; AVX-NEXT: retq 2072 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2073 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2074 ret <4 x float> %2 2075} 2076 2077define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { 2078; SSE2-LABEL: combine_undef_input_test5: 2079; SSE2: # %bb.0: 2080; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2081; SSE2-NEXT: retq 2082; 2083; SSSE3-LABEL: combine_undef_input_test5: 2084; SSSE3: # %bb.0: 2085; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2086; SSSE3-NEXT: retq 2087; 2088; SSE41-LABEL: combine_undef_input_test5: 2089; SSE41: # %bb.0: 2090; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2091; SSE41-NEXT: retq 2092; 2093; AVX-LABEL: combine_undef_input_test5: 2094; AVX: # %bb.0: 2095; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2096; AVX-NEXT: retq 2097 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2098 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2099 ret <4 x float> %2 2100} 2101 2102 2103; Verify that we fold shuffles according to rule: 2104; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2105 2106define <4 x float> @combine_undef_input_test6(<4 x float> %a) { 2107; CHECK-LABEL: combine_undef_input_test6: 2108; CHECK: # %bb.0: 2109; CHECK-NEXT: retq 2110 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2111 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2112 ret <4 x float> %2 2113} 2114 2115define <4 x float> @combine_undef_input_test7(<4 x float> %a) { 2116; SSE2-LABEL: combine_undef_input_test7: 2117; SSE2: # 
%bb.0: 2118; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2119; SSE2-NEXT: retq 2120; 2121; SSSE3-LABEL: combine_undef_input_test7: 2122; SSSE3: # %bb.0: 2123; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2124; SSSE3-NEXT: retq 2125; 2126; SSE41-LABEL: combine_undef_input_test7: 2127; SSE41: # %bb.0: 2128; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2129; SSE41-NEXT: retq 2130; 2131; AVX-LABEL: combine_undef_input_test7: 2132; AVX: # %bb.0: 2133; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2134; AVX-NEXT: retq 2135 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2136 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2137 ret <4 x float> %2 2138} 2139 2140define <4 x float> @combine_undef_input_test8(<4 x float> %a) { 2141; SSE2-LABEL: combine_undef_input_test8: 2142; SSE2: # %bb.0: 2143; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2144; SSE2-NEXT: retq 2145; 2146; SSSE3-LABEL: combine_undef_input_test8: 2147; SSSE3: # %bb.0: 2148; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2149; SSSE3-NEXT: retq 2150; 2151; SSE41-LABEL: combine_undef_input_test8: 2152; SSE41: # %bb.0: 2153; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2154; SSE41-NEXT: retq 2155; 2156; AVX-LABEL: combine_undef_input_test8: 2157; AVX: # %bb.0: 2158; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2159; AVX-NEXT: retq 2160 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2161 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2162 ret <4 x float> %2 2163} 2164 2165define <4 x float> @combine_undef_input_test9(<4 x float> %a) { 2166; SSE-LABEL: combine_undef_input_test9: 2167; SSE: # %bb.0: 2168; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 2169; SSE-NEXT: retq 2170; 2171; AVX-LABEL: combine_undef_input_test9: 2172; AVX: # %bb.0: 2173; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2174; AVX-NEXT: retq 2175 %1 = shufflevector <4 x 
float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2176 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2177 ret <4 x float> %2 2178} 2179 2180define <4 x float> @combine_undef_input_test10(<4 x float> %a) { 2181; CHECK-LABEL: combine_undef_input_test10: 2182; CHECK: # %bb.0: 2183; CHECK-NEXT: retq 2184 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2185 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2186 ret <4 x float> %2 2187} 2188 2189define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { 2190; SSE2-LABEL: combine_undef_input_test11: 2191; SSE2: # %bb.0: 2192; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2193; SSE2-NEXT: retq 2194; 2195; SSSE3-LABEL: combine_undef_input_test11: 2196; SSSE3: # %bb.0: 2197; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2198; SSSE3-NEXT: retq 2199; 2200; SSE41-LABEL: combine_undef_input_test11: 2201; SSE41: # %bb.0: 2202; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2203; SSE41-NEXT: retq 2204; 2205; AVX-LABEL: combine_undef_input_test11: 2206; AVX: # %bb.0: 2207; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2208; AVX-NEXT: retq 2209 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2210 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6> 2211 ret <4 x float> %2 2212} 2213 2214define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { 2215; SSE-LABEL: combine_undef_input_test12: 2216; SSE: # %bb.0: 2217; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2218; SSE-NEXT: retq 2219; 2220; AVX-LABEL: combine_undef_input_test12: 2221; AVX: # %bb.0: 2222; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2223; AVX-NEXT: retq 2224 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2225 %2 = 
shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2226 ret <4 x float> %2 2227} 2228 2229define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { 2230; SSE-LABEL: combine_undef_input_test13: 2231; SSE: # %bb.0: 2232; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2233; SSE-NEXT: retq 2234; 2235; AVX-LABEL: combine_undef_input_test13: 2236; AVX: # %bb.0: 2237; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2238; AVX-NEXT: retq 2239 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2240 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5> 2241 ret <4 x float> %2 2242} 2243 2244define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { 2245; SSE-LABEL: combine_undef_input_test14: 2246; SSE: # %bb.0: 2247; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2248; SSE-NEXT: retq 2249; 2250; AVX-LABEL: combine_undef_input_test14: 2251; AVX: # %bb.0: 2252; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2253; AVX-NEXT: retq 2254 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2255 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2256 ret <4 x float> %2 2257} 2258 2259define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { 2260; SSE2-LABEL: combine_undef_input_test15: 2261; SSE2: # %bb.0: 2262; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2263; SSE2-NEXT: retq 2264; 2265; SSSE3-LABEL: combine_undef_input_test15: 2266; SSSE3: # %bb.0: 2267; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2268; SSSE3-NEXT: retq 2269; 2270; SSE41-LABEL: combine_undef_input_test15: 2271; SSE41: # %bb.0: 2272; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2273; SSE41-NEXT: retq 2274; 2275; AVX-LABEL: combine_undef_input_test15: 2276; AVX: # %bb.0: 2277; AVX-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3] 2278; AVX-NEXT: retq 2279 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2280 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2281 ret <4 x float> %2 2282} 2283 2284 2285; Verify that shuffles are canonicalized according to rules: 2286; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) 2287; 2288; This allows to trigger the following combine rule: 2289; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2290; 2291; As a result, all the shuffle pairs in each function below should be 2292; combined into a single legal shuffle operation. 2293 2294define <4 x float> @combine_undef_input_test16(<4 x float> %a) { 2295; CHECK-LABEL: combine_undef_input_test16: 2296; CHECK: # %bb.0: 2297; CHECK-NEXT: retq 2298 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2299 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3> 2300 ret <4 x float> %2 2301} 2302 2303define <4 x float> @combine_undef_input_test17(<4 x float> %a) { 2304; SSE2-LABEL: combine_undef_input_test17: 2305; SSE2: # %bb.0: 2306; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2307; SSE2-NEXT: retq 2308; 2309; SSSE3-LABEL: combine_undef_input_test17: 2310; SSSE3: # %bb.0: 2311; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2312; SSSE3-NEXT: retq 2313; 2314; SSE41-LABEL: combine_undef_input_test17: 2315; SSE41: # %bb.0: 2316; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2317; SSE41-NEXT: retq 2318; 2319; AVX-LABEL: combine_undef_input_test17: 2320; AVX: # %bb.0: 2321; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2322; AVX-NEXT: retq 2323 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2324 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2325 ret <4 x float> %2 2326} 2327 2328define <4 x float> 
@combine_undef_input_test18(<4 x float> %a) { 2329; SSE2-LABEL: combine_undef_input_test18: 2330; SSE2: # %bb.0: 2331; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2332; SSE2-NEXT: retq 2333; 2334; SSSE3-LABEL: combine_undef_input_test18: 2335; SSSE3: # %bb.0: 2336; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2337; SSSE3-NEXT: retq 2338; 2339; SSE41-LABEL: combine_undef_input_test18: 2340; SSE41: # %bb.0: 2341; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2342; SSE41-NEXT: retq 2343; 2344; AVX-LABEL: combine_undef_input_test18: 2345; AVX: # %bb.0: 2346; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2347; AVX-NEXT: retq 2348 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2349 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> 2350 ret <4 x float> %2 2351} 2352 2353define <4 x float> @combine_undef_input_test19(<4 x float> %a) { 2354; SSE-LABEL: combine_undef_input_test19: 2355; SSE: # %bb.0: 2356; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] 2357; SSE-NEXT: retq 2358; 2359; AVX-LABEL: combine_undef_input_test19: 2360; AVX: # %bb.0: 2361; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2362; AVX-NEXT: retq 2363 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2364 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2365 ret <4 x float> %2 2366} 2367 2368define <4 x float> @combine_undef_input_test20(<4 x float> %a) { 2369; CHECK-LABEL: combine_undef_input_test20: 2370; CHECK: # %bb.0: 2371; CHECK-NEXT: retq 2372 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2373 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2374 ret <4 x float> %2 2375} 2376 2377; These tests are designed to test the ability to combine away unnecessary 2378; operations feeding into a shuffle. 
The AVX cases are the important ones as 2379; they leverage operations which cannot be done naturally on the entire vector 2380; and thus are decomposed into multiple smaller operations. 2381 2382define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) { 2383; SSE-LABEL: combine_unneeded_subvector1: 2384; SSE: # %bb.0: 2385; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2386; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0] 2387; SSE-NEXT: movdqa %xmm0, %xmm1 2388; SSE-NEXT: retq 2389; 2390; AVX1-LABEL: combine_unneeded_subvector1: 2391; AVX1: # %bb.0: 2392; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2393; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2394; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2395; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2396; AVX1-NEXT: retq 2397; 2398; AVX2-SLOW-LABEL: combine_unneeded_subvector1: 2399; AVX2-SLOW: # %bb.0: 2400; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2401; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2402; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 2403; AVX2-SLOW-NEXT: retq 2404; 2405; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1: 2406; AVX2-FAST-ALL: # %bb.0: 2407; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2408; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] 2409; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] 2410; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 2411; AVX2-FAST-ALL-NEXT: retq 2412; 2413; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1: 2414; AVX2-FAST-PERLANE: # %bb.0: 2415; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2416; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2417; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] 2418; AVX2-FAST-PERLANE-NEXT: retq 2419 %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2420 %c = shufflevector <8 x i32> %b, <8 
x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> 2421 ret <8 x i32> %c 2422} 2423 2424define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) { 2425; SSE-LABEL: combine_unneeded_subvector2: 2426; SSE: # %bb.0: 2427; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2428; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0] 2429; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] 2430; SSE-NEXT: retq 2431; 2432; AVX1-LABEL: combine_unneeded_subvector2: 2433; AVX1: # %bb.0: 2434; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2435; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2436; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 2437; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2438; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2439; AVX1-NEXT: retq 2440; 2441; AVX2-LABEL: combine_unneeded_subvector2: 2442; AVX2: # %bb.0: 2443; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2444; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] 2445; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] 2446; AVX2-NEXT: retq 2447 %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> 2448 %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12> 2449 ret <8 x i32> %d 2450} 2451 2452define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) { 2453; SSE2-LABEL: combine_insertps1: 2454; SSE2: # %bb.0: 2455; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2456; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2457; SSE2-NEXT: movaps %xmm1, %xmm0 2458; SSE2-NEXT: retq 2459; 2460; SSSE3-LABEL: combine_insertps1: 2461; SSSE3: # %bb.0: 2462; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2463; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 2464; SSSE3-NEXT: movaps %xmm1, %xmm0 2465; SSSE3-NEXT: retq 2466; 2467; SSE41-LABEL: combine_insertps1: 2468; SSE41: # 
%bb.0: 2469; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] 2470; SSE41-NEXT: retq 2471; 2472; AVX-LABEL: combine_insertps1: 2473; AVX: # %bb.0: 2474; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] 2475; AVX-NEXT: retq 2476 2477 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4> 2478 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3> 2479 ret <4 x float> %d 2480} 2481 2482define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) { 2483; SSE2-LABEL: combine_insertps2: 2484; SSE2: # %bb.0: 2485; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2486; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2487; SSE2-NEXT: movaps %xmm1, %xmm0 2488; SSE2-NEXT: retq 2489; 2490; SSSE3-LABEL: combine_insertps2: 2491; SSSE3: # %bb.0: 2492; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0] 2493; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 2494; SSSE3-NEXT: movaps %xmm1, %xmm0 2495; SSSE3-NEXT: retq 2496; 2497; SSE41-LABEL: combine_insertps2: 2498; SSE41: # %bb.0: 2499; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2500; SSE41-NEXT: retq 2501; 2502; AVX-LABEL: combine_insertps2: 2503; AVX: # %bb.0: 2504; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] 2505; AVX-NEXT: retq 2506 2507 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7> 2508 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2509 ret <4 x float> %d 2510} 2511 2512define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) { 2513; SSE2-LABEL: combine_insertps3: 2514; SSE2: # %bb.0: 2515; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2516; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2517; SSE2-NEXT: retq 2518; 2519; SSSE3-LABEL: combine_insertps3: 2520; SSSE3: # %bb.0: 2521; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 2522; SSSE3-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2523; SSSE3-NEXT: retq 2524; 2525; SSE41-LABEL: combine_insertps3: 2526; SSE41: # %bb.0: 2527; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2528; SSE41-NEXT: retq 2529; 2530; AVX-LABEL: combine_insertps3: 2531; AVX: # %bb.0: 2532; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2533; AVX-NEXT: retq 2534 2535 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2536 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3> 2537 ret <4 x float> %d 2538} 2539 2540define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) { 2541; SSE2-LABEL: combine_insertps4: 2542; SSE2: # %bb.0: 2543; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 2544; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2545; SSE2-NEXT: retq 2546; 2547; SSSE3-LABEL: combine_insertps4: 2548; SSSE3: # %bb.0: 2549; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 2550; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2551; SSSE3-NEXT: retq 2552; 2553; SSE41-LABEL: combine_insertps4: 2554; SSE41: # %bb.0: 2555; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2556; SSE41-NEXT: retq 2557; 2558; AVX-LABEL: combine_insertps4: 2559; AVX: # %bb.0: 2560; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2561; AVX-NEXT: retq 2562 2563 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2564 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5> 2565 ret <4 x float> %d 2566} 2567 2568define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) { 2569; SSE-LABEL: combine_scalar_load_with_blend_with_zero: 2570; SSE: # %bb.0: 2571; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2572; SSE-NEXT: movaps %xmm0, (%rsi) 2573; SSE-NEXT: retq 2574; 2575; AVX-LABEL: combine_scalar_load_with_blend_with_zero: 2576; AVX: # %bb.0: 2577; AVX-NEXT: vmovsd 
{{.*#+}} xmm0 = mem[0],zero 2578; AVX-NEXT: vmovaps %xmm0, (%rsi) 2579; AVX-NEXT: retq 2580 %1 = load double, double* %a0, align 8 2581 %2 = insertelement <2 x double> undef, double %1, i32 0 2582 %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1 2583 %4 = bitcast <2 x double> %3 to <4 x float> 2584 %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 2585 store <4 x float> %5, <4 x float>* %a1, align 16 2586 ret void 2587} 2588 2589; PR30371 2590define <4 x float> @combine_constant_insertion_v4f32(float %f) { 2591; SSE2-LABEL: combine_constant_insertion_v4f32: 2592; SSE2: # %bb.0: 2593; SSE2-NEXT: movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0> 2594; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2595; SSE2-NEXT: movaps %xmm1, %xmm0 2596; SSE2-NEXT: retq 2597; 2598; SSSE3-LABEL: combine_constant_insertion_v4f32: 2599; SSSE3: # %bb.0: 2600; SSSE3-NEXT: movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0> 2601; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2602; SSSE3-NEXT: movaps %xmm1, %xmm0 2603; SSSE3-NEXT: retq 2604; 2605; SSE41-LABEL: combine_constant_insertion_v4f32: 2606; SSE41: # %bb.0: 2607; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2608; SSE41-NEXT: retq 2609; 2610; AVX-LABEL: combine_constant_insertion_v4f32: 2611; AVX: # %bb.0: 2612; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2613; AVX-NEXT: retq 2614 %a0 = insertelement <4 x float> undef, float %f, i32 0 2615 %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2616 ret <4 x float> %ret 2617} 2618 2619define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { 2620; SSE2-LABEL: combine_constant_insertion_v4i32: 2621; SSE2: # %bb.0: 2622; SSE2-NEXT: movd %edi, %xmm1 2623; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30> 2624; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2625; 
SSE2-NEXT: retq 2626; 2627; SSSE3-LABEL: combine_constant_insertion_v4i32: 2628; SSSE3: # %bb.0: 2629; SSSE3-NEXT: movd %edi, %xmm1 2630; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30> 2631; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2632; SSSE3-NEXT: retq 2633; 2634; SSE41-LABEL: combine_constant_insertion_v4i32: 2635; SSE41: # %bb.0: 2636; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <u,4,5,30> 2637; SSE41-NEXT: pinsrd $0, %edi, %xmm0 2638; SSE41-NEXT: retq 2639; 2640; AVX-LABEL: combine_constant_insertion_v4i32: 2641; AVX: # %bb.0: 2642; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,4,5,30> 2643; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 2644; AVX-NEXT: retq 2645 %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 2646 %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2647 ret <4 x i32> %ret 2648} 2649 2650define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { 2651; SSE2-LABEL: PR22377: 2652; SSE2: # %bb.0: # %entry 2653; SSE2-NEXT: movaps %xmm0, %xmm1 2654; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3] 2655; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] 2656; SSE2-NEXT: addps %xmm0, %xmm1 2657; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2658; SSE2-NEXT: retq 2659; 2660; SSSE3-LABEL: PR22377: 2661; SSSE3: # %bb.0: # %entry 2662; SSSE3-NEXT: movaps %xmm0, %xmm1 2663; SSSE3-NEXT: haddps %xmm0, %xmm1 2664; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2665; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2666; SSSE3-NEXT: retq 2667; 2668; SSE41-LABEL: PR22377: 2669; SSE41: # %bb.0: # %entry 2670; SSE41-NEXT: movaps %xmm0, %xmm1 2671; SSE41-NEXT: haddps %xmm0, %xmm1 2672; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2673; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2674; SSE41-NEXT: retq 2675; 2676; AVX-LABEL: PR22377: 2677; AVX: # %bb.0: # %entry 2678; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1 2679; AVX-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[0,2],xmm1[0,1] 2680; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2681; AVX-NEXT: retq 2682entry: 2683 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3> 2684 %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2685 %r2 = fadd <4 x float> %s1, %s2 2686 %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 2687 ret <4 x float> %s3 2688} 2689 2690define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) { 2691; SSE2-LABEL: PR22390: 2692; SSE2: # %bb.0: # %entry 2693; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2694; SSE2-NEXT: movaps %xmm0, %xmm2 2695; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2696; SSE2-NEXT: addps %xmm0, %xmm2 2697; SSE2-NEXT: movaps %xmm2, %xmm0 2698; SSE2-NEXT: retq 2699; 2700; SSSE3-LABEL: PR22390: 2701; SSSE3: # %bb.0: # %entry 2702; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2703; SSSE3-NEXT: movaps %xmm0, %xmm2 2704; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2705; SSSE3-NEXT: addps %xmm0, %xmm2 2706; SSSE3-NEXT: movaps %xmm2, %xmm0 2707; SSSE3-NEXT: retq 2708; 2709; SSE41-LABEL: PR22390: 2710; SSE41: # %bb.0: # %entry 2711; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2712; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2713; SSE41-NEXT: addps %xmm1, %xmm0 2714; SSE41-NEXT: retq 2715; 2716; AVX-LABEL: PR22390: 2717; AVX: # %bb.0: # %entry 2718; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2719; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2720; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 2721; AVX-NEXT: retq 2722entry: 2723 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> 2724 %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 2725 %r2 = fadd <4 x float> %s1, %s2 2726 ret <4 x float> %r2 2727} 2728 2729define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) { 2730; 
SSE-LABEL: PR22412: 2731; SSE: # %bb.0: # %entry 2732; SSE-NEXT: movaps %xmm3, %xmm1 2733; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] 2734; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2] 2735; SSE-NEXT: retq 2736; 2737; AVX1-LABEL: PR22412: 2738; AVX1: # %bb.0: # %entry 2739; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] 2740; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2741; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6] 2742; AVX1-NEXT: retq 2743; 2744; AVX2-LABEL: PR22412: 2745; AVX2: # %bb.0: # %entry 2746; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2747; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1] 2748; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6] 2749; AVX2-NEXT: retq 2750entry: 2751 %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2752 %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2> 2753 ret <8 x float> %s2 2754} 2755 2756define <4 x float> @PR30264(<4 x float> %x) { 2757; SSE2-LABEL: PR30264: 2758; SSE2: # %bb.0: 2759; SSE2-NEXT: xorps %xmm1, %xmm1 2760; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2761; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] 2762; SSE2-NEXT: movaps %xmm1, %xmm0 2763; SSE2-NEXT: retq 2764; 2765; SSSE3-LABEL: PR30264: 2766; SSSE3: # %bb.0: 2767; SSSE3-NEXT: xorps %xmm1, %xmm1 2768; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2769; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] 2770; SSSE3-NEXT: movaps %xmm1, %xmm0 2771; SSSE3-NEXT: retq 2772; 2773; SSE41-LABEL: PR30264: 2774; SSE41: # %bb.0: 2775; SSE41-NEXT: movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0> 2776; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3] 2777; SSE41-NEXT: movaps %xmm1, %xmm0 2778; SSE41-NEXT: retq 2779; 2780; AVX-LABEL: PR30264: 2781; AVX: # %bb.0: 2782; 
AVX-NEXT: vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0> 2783; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3] 2784; AVX-NEXT: retq 2785 %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 2786 %shuf2 = shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 2787 ret <4 x float> %shuf2 2788} 2789 2790define <8 x i16> @PR39549(<16 x i8> %x) { 2791; SSE-LABEL: PR39549: 2792; SSE: # %bb.0: 2793; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2794; SSE-NEXT: psraw $8, %xmm0 2795; SSE-NEXT: retq 2796; 2797; AVX-LABEL: PR39549: 2798; AVX: # %bb.0: 2799; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2800; AVX-NEXT: vpsraw $8, %xmm0, %xmm0 2801; AVX-NEXT: retq 2802 %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef> 2803 %b = bitcast <16 x i8> %a to <8 x i16> 2804 %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2805 %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2806 ret <8 x i16> %d 2807} 2808 2809define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) { 2810; SSE-LABEL: PR41545: 2811; SSE: # %bb.0: 2812; SSE-NEXT: paddd %xmm1, %xmm0 2813; SSE-NEXT: retq 2814; 2815; AVX-LABEL: PR41545: 2816; AVX: # %bb.0: 2817; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2818; AVX-NEXT: retq 2819 %1 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 2820 %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 2821 %3 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 2822 %4 = shufflevector <16 x i8> %a1, 
<16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 2823 %5 = zext <4 x i8> %1 to <4 x i32> 2824 %6 = zext <4 x i8> %2 to <4 x i32> 2825 %7 = zext <4 x i8> %3 to <4 x i32> 2826 %8 = zext <4 x i8> %4 to <4 x i32> 2827 %9 = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8> 2828 %10 = shl <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16> 2829 %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24> 2830 %12 = or <4 x i32> %5, %9 2831 %13 = or <4 x i32> %12, %10 2832 %14 = or <4 x i32> %13, %11 2833 %15 = add <4 x i32> %a0, %14 2834 ret <4 x i32> %15 2835} 2836 2837define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) { 2838; SSE-LABEL: shuffle_extract_insert: 2839; SSE: # %bb.0: 2840; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2841; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2842; SSE-NEXT: retq 2843; 2844; AVX1-LABEL: shuffle_extract_insert: 2845; AVX1: # %bb.0: 2846; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2847; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2848; AVX1-NEXT: retq 2849; 2850; AVX2-SLOW-LABEL: shuffle_extract_insert: 2851; AVX2-SLOW: # %bb.0: 2852; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2853; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2854; AVX2-SLOW-NEXT: retq 2855; 2856; AVX2-FAST-LABEL: shuffle_extract_insert: 2857; AVX2-FAST: # %bb.0: 2858; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15] 2859; AVX2-FAST-NEXT: retq 2860 %a0 = extractelement <8 x i16> %a, i32 0 2861 %a1 = extractelement <8 x i16> %a, i32 1 2862 %a3 = extractelement <8 x i16> %a, i32 3 2863 %a4 = extractelement <8 x i16> %a, i32 4 2864 %a5 = extractelement <8 x i16> %a, i32 5 2865 %a6 = extractelement <8 x i16> %a, i32 6 2866 %a7 = extractelement <8 x i16> %a, i32 7 2867 %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2868 %2 = insertelement <8 x i16> %1, 
i16 %a1, i32 1 2869 %3 = insertelement <8 x i16> %2, i16 %a0, i32 2 2870 %4 = insertelement <8 x i16> %3, i16 %a3, i32 3 2871 %5 = insertelement <8 x i16> %4, i16 %a6, i32 4 2872 %6 = insertelement <8 x i16> %5, i16 %a5, i32 5 2873 %7 = insertelement <8 x i16> %6, i16 %a4, i32 6 2874 %8 = insertelement <8 x i16> %7, i16 %a7, i32 7 2875 ret <8 x i16> %8 2876} 2877 2878define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) { 2879; SSE2-LABEL: shuffle_extract_insert_double: 2880; SSE2: # %bb.0: 2881; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 2882; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 2883; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2884; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] 2885; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2886; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2887; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2888; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 2889; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2890; SSE2-NEXT: retq 2891; 2892; SSSE3-LABEL: shuffle_extract_insert_double: 2893; SSSE3: # %bb.0: 2894; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2895; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2896; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2897; SSSE3-NEXT: retq 2898; 2899; SSE41-LABEL: shuffle_extract_insert_double: 2900; SSE41: # %bb.0: 2901; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2902; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2903; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2904; SSE41-NEXT: retq 2905; 2906; AVX-LABEL: shuffle_extract_insert_double: 2907; AVX: # %bb.0: 2908; AVX-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2909; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2910; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2911; AVX-NEXT: retq 2912 %a0 = extractelement <8 x i16> %a, i32 0 2913 %a4 = extractelement <8 x i16> %a, i32 4 2914 %a6 = extractelement <8 x i16> %a, i32 6 2915 %b11 = extractelement <8 x i16> %b, i32 3 2916 %b13 = extractelement <8 x i16> %b, i32 5 2917 %b15 = extractelement <8 x i16> %b, i32 7 2918 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2919 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2 2920 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3 2921 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4 2922 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5 2923 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6 2924 %7 = insertelement <8 x i16> %6, i16 %b15, i32 7 2925 ret <8 x i16> %7 2926} 2927 2928define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) { 2929; SSE2-LABEL: shuffle_extract_concat_insert: 2930; SSE2: # %bb.0: 2931; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2932; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2933; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2934; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2935; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 2936; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] 2937; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 2938; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2939; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] 2940; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2941; SSE2-NEXT: retq 2942; 2943; SSSE3-LABEL: shuffle_extract_concat_insert: 2944; SSSE3: # %bb.0: 2945; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] 2946; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2947; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2948; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2949; SSSE3-NEXT: retq 2950; 2951; SSE41-LABEL: shuffle_extract_concat_insert: 2952; SSE41: # %bb.0: 2953; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2954; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2955; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2956; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2957; SSE41-NEXT: retq 2958; 2959; AVX-LABEL: shuffle_extract_concat_insert: 2960; AVX: # %bb.0: 2961; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2962; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2963; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2964; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2965; AVX-NEXT: retq 2966 %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2967 %a0 = extractelement <8 x i16> %a, i32 0 2968 %a4 = extractelement <8 x i16> %a, i32 4 2969 %a6 = extractelement <8 x i16> %a, i32 6 2970 %b11 = extractelement <8 x i16> %b, i32 3 2971 %b13 = extractelement <8 x i16> %b, i32 5 2972 %b15 = extractelement <8 x i16> %b, i32 7 2973 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2974 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2 2975 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3 2976 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4 2977 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5 2978 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6 2979 %7 = 
insertelement <8 x i16> %6, i16 %b15, i32 7 2980 ret <8 x i16> %7 2981} 2982 2983define <8 x i16> @shuffle_scalar_to_vector_extract(<8 x i8>* %p0, i8* %p1, i8* %p2) { 2984; SSE2-LABEL: shuffle_scalar_to_vector_extract: 2985; SSE2: # %bb.0: 2986; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 2987; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 2988; SSE2-NEXT: psraw $8, %xmm1 2989; SSE2-NEXT: pextrw $7, %xmm1, %eax 2990; SSE2-NEXT: movd %eax, %xmm2 2991; SSE2-NEXT: movsbl (%rsi), %eax 2992; SSE2-NEXT: movd %eax, %xmm0 2993; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 2994; SSE2-NEXT: movsbl (%rdx), %eax 2995; SSE2-NEXT: movd %eax, %xmm0 2996; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2997; SSE2-NEXT: pxor %xmm0, %xmm0 2998; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 2999; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3000; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3001; SSE2-NEXT: retq 3002; 3003; SSSE3-LABEL: shuffle_scalar_to_vector_extract: 3004; SSSE3: # %bb.0: 3005; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3006; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 3007; SSSE3-NEXT: psraw $8, %xmm1 3008; SSSE3-NEXT: movsbl (%rsi), %eax 3009; SSSE3-NEXT: movd %eax, %xmm2 3010; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 3011; SSSE3-NEXT: movsbl (%rdx), %eax 3012; SSSE3-NEXT: movd %eax, %xmm0 3013; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3014; SSSE3-NEXT: pxor %xmm0, %xmm0 3015; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3016; 
SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3017; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3018; SSSE3-NEXT: retq 3019; 3020; SSE41-LABEL: shuffle_scalar_to_vector_extract: 3021; SSE41: # %bb.0: 3022; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 3023; SSE41-NEXT: pextrw $4, %xmm0, %eax 3024; SSE41-NEXT: pextrw $7, %xmm0, %ecx 3025; SSE41-NEXT: pxor %xmm0, %xmm0 3026; SSE41-NEXT: pinsrw $1, %eax, %xmm0 3027; SSE41-NEXT: movl $65531, %eax # imm = 0xFFFB 3028; SSE41-NEXT: pinsrw $2, %eax, %xmm0 3029; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 3030; SSE41-NEXT: movsbl (%rsi), %eax 3031; SSE41-NEXT: pinsrw $5, %eax, %xmm0 3032; SSE41-NEXT: movsbl (%rdx), %eax 3033; SSE41-NEXT: pinsrw $6, %eax, %xmm0 3034; SSE41-NEXT: retq 3035; 3036; AVX-LABEL: shuffle_scalar_to_vector_extract: 3037; AVX: # %bb.0: 3038; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 3039; AVX-NEXT: vpextrw $4, %xmm0, %eax 3040; AVX-NEXT: vpextrw $7, %xmm0, %ecx 3041; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3042; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 3043; AVX-NEXT: movl $65531, %eax # imm = 0xFFFB 3044; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 3045; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 3046; AVX-NEXT: movsbl (%rsi), %eax 3047; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 3048; AVX-NEXT: movsbl (%rdx), %eax 3049; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 3050; AVX-NEXT: retq 3051 %tmp = load <8 x i8>, <8 x i8>* %p0, align 1 3052 %tmp1 = sext <8 x i8> %tmp to <8 x i16> 3053 %tmp2 = load i8, i8* %p1, align 1 3054 %cvt1 = sext i8 %tmp2 to i16 3055 %tmp3 = load i8, i8* %p2, align 1 3056 %cvt2 = sext i8 %tmp3 to i16 3057 %tmp4 = extractelement <8 x i16> %tmp1, i32 4 3058 %tmp5 = extractelement <8 x i16> %tmp1, i32 7 3059 %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0 3060 %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1 3061 %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3 3062 %tmp9 = insertelement <8 
x i16> %tmp8, i16 %tmp5, i32 4 3063 %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5 3064 %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6 3065 %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7 3066 %tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7> 3067 ret <8 x i16> %tmp13 3068} 3069 3070; Bug noticed in D96345 3071define i32 @shuffle_binops_with_undef() { 3072; SSE-LABEL: shuffle_binops_with_undef: 3073; SSE: # %bb.0: # %entry 3074; SSE-NEXT: movdqa (%rax), %xmm0 3075; SSE-NEXT: paddw %xmm0, %xmm0 3076; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 3077; SSE-NEXT: psrlw %xmm1, %xmm0 3078; SSE-NEXT: movdqa %xmm0, (%rax) 3079; SSE-NEXT: retq 3080; 3081; AVX-LABEL: shuffle_binops_with_undef: 3082; AVX: # %bb.0: # %entry 3083; AVX-NEXT: vmovdqa (%rax), %xmm0 3084; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 3085; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 3086; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 3087; AVX-NEXT: vmovdqa %xmm0, (%rax) 3088; AVX-NEXT: retq 3089entry: 3090 %load0 = load <8 x i16>, <8 x i16>* undef, align 16 3091 %load1 = load <8 x i16>, <8 x i16>* undef, align 16 3092 %shuf0 = shufflevector <16 x i8> undef, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> 3093 %addi = add <8 x i16> %load0, %load1 3094 %bc0 = bitcast <8 x i16> %addi to <2 x i64> 3095 %bc1 = bitcast <16 x i8> %shuf0 to <8 x i16> 3096 %shuf1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 3097 %addi24 = add <8 x i16> %shuf1, %bc1 3098 %bc2 = bitcast <8 x i16> %addi24 to <2 x i64> 3099 %shuf2 = shufflevector <2 x i64> %bc0, <2 x i64> %bc2, <2 x i32> <i32 0, i32 
2> 3100 %bc3 = bitcast <2 x i64> %shuf2 to <8 x i16> 3101 %psrli = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %bc3, i32 ptrtoint (i32 ()* @shuffle_binops_with_undef to i32)) 3102 store <8 x i16> %psrli, <8 x i16>* undef, align 16 3103 ret i32 undef 3104} 3105declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) 3106 3107define void @PR43024() { 3108; SSE-LABEL: PR43024: 3109; SSE: # %bb.0: 3110; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3111; SSE-NEXT: movaps %xmm0, (%rax) 3112; SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3113; SSE-NEXT: xorps %xmm1, %xmm1 3114; SSE-NEXT: addss %xmm1, %xmm0 3115; SSE-NEXT: addss %xmm1, %xmm0 3116; SSE-NEXT: movss %xmm0, (%rax) 3117; SSE-NEXT: retq 3118; 3119; AVX-LABEL: PR43024: 3120; AVX: # %bb.0: 3121; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3122; AVX-NEXT: vmovaps %xmm0, (%rax) 3123; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0 3124; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 3125; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 3126; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm0, %xmm0 3127; AVX-NEXT: vmovss %xmm0, (%rax) 3128; AVX-NEXT: retq 3129 store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, <4 x float>* undef, align 16 3130 %1 = load <4 x float>, <4 x float>* undef, align 16 3131 %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0> 3132 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 3133 %4 = fadd <4 x float> %2, %3 3134 %5 = fadd <4 x float> zeroinitializer, %4 3135 %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> 3136 %7 = fadd <4 x float> %6, %5 3137 %8 = extractelement <4 x float> %7, i32 0 3138 store float %8, float* undef, align 8 3139 ret void 3140} 3141 3142define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) { 3143; SSE2-LABEL: PR45604: 3144; SSE2: # %bb.0: 3145; SSE2-NEXT: 
movdqa (%rsi), %xmm1 3146; SSE2-NEXT: movd %xmm1, %eax 3147; SSE2-NEXT: movzwl %ax, %eax 3148; SSE2-NEXT: movd %eax, %xmm0 3149; SSE2-NEXT: movl $11, %eax 3150; SSE2-NEXT: pinsrw $2, %eax, %xmm0 3151; SSE2-NEXT: pextrw $1, %xmm1, %ecx 3152; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 3153; SSE2-NEXT: pinsrw $6, %eax, %xmm0 3154; SSE2-NEXT: pextrw $2, %xmm1, %ecx 3155; SSE2-NEXT: movd %ecx, %xmm2 3156; SSE2-NEXT: pinsrw $2, %eax, %xmm2 3157; SSE2-NEXT: pextrw $3, %xmm1, %ecx 3158; SSE2-NEXT: pinsrw $4, %ecx, %xmm2 3159; SSE2-NEXT: pinsrw $6, %eax, %xmm2 3160; SSE2-NEXT: pextrw $4, %xmm1, %ecx 3161; SSE2-NEXT: movd %ecx, %xmm3 3162; SSE2-NEXT: pinsrw $2, %eax, %xmm3 3163; SSE2-NEXT: pextrw $5, %xmm1, %ecx 3164; SSE2-NEXT: pinsrw $4, %ecx, %xmm3 3165; SSE2-NEXT: pinsrw $6, %eax, %xmm3 3166; SSE2-NEXT: pextrw $6, %xmm1, %ecx 3167; SSE2-NEXT: movd %ecx, %xmm4 3168; SSE2-NEXT: pinsrw $2, %eax, %xmm4 3169; SSE2-NEXT: pextrw $7, %xmm1, %ecx 3170; SSE2-NEXT: pinsrw $4, %ecx, %xmm4 3171; SSE2-NEXT: pinsrw $6, %eax, %xmm4 3172; SSE2-NEXT: movdqa %xmm4, 48(%rdi) 3173; SSE2-NEXT: movdqa %xmm3, 32(%rdi) 3174; SSE2-NEXT: movdqa %xmm2, 16(%rdi) 3175; SSE2-NEXT: movdqa %xmm0, (%rdi) 3176; SSE2-NEXT: retq 3177; 3178; SSSE3-LABEL: PR45604: 3179; SSSE3: # %bb.0: 3180; SSSE3-NEXT: movdqa (%rsi), %xmm1 3181; SSSE3-NEXT: movd %xmm1, %eax 3182; SSSE3-NEXT: movzwl %ax, %eax 3183; SSSE3-NEXT: movd %eax, %xmm0 3184; SSSE3-NEXT: movl $11, %eax 3185; SSSE3-NEXT: pinsrw $2, %eax, %xmm0 3186; SSSE3-NEXT: pextrw $1, %xmm1, %ecx 3187; SSSE3-NEXT: pinsrw $4, %ecx, %xmm0 3188; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 3189; SSSE3-NEXT: pextrw $2, %xmm1, %ecx 3190; SSSE3-NEXT: movd %ecx, %xmm2 3191; SSSE3-NEXT: pinsrw $2, %eax, %xmm2 3192; SSSE3-NEXT: pextrw $3, %xmm1, %ecx 3193; SSSE3-NEXT: pinsrw $4, %ecx, %xmm2 3194; SSSE3-NEXT: pinsrw $6, %eax, %xmm2 3195; SSSE3-NEXT: pextrw $4, %xmm1, %ecx 3196; SSSE3-NEXT: movd %ecx, %xmm3 3197; SSSE3-NEXT: pinsrw $2, %eax, %xmm3 3198; SSSE3-NEXT: pextrw $5, %xmm1, %ecx 3199; 
SSSE3-NEXT: pinsrw $4, %ecx, %xmm3 3200; SSSE3-NEXT: pinsrw $6, %eax, %xmm3 3201; SSSE3-NEXT: pextrw $6, %xmm1, %ecx 3202; SSSE3-NEXT: movd %ecx, %xmm4 3203; SSSE3-NEXT: pinsrw $2, %eax, %xmm4 3204; SSSE3-NEXT: pextrw $7, %xmm1, %ecx 3205; SSSE3-NEXT: pinsrw $4, %ecx, %xmm4 3206; SSSE3-NEXT: pinsrw $6, %eax, %xmm4 3207; SSSE3-NEXT: movdqa %xmm4, 48(%rdi) 3208; SSSE3-NEXT: movdqa %xmm3, 32(%rdi) 3209; SSSE3-NEXT: movdqa %xmm2, 16(%rdi) 3210; SSSE3-NEXT: movdqa %xmm0, (%rdi) 3211; SSSE3-NEXT: retq 3212; 3213; SSE41-LABEL: PR45604: 3214; SSE41: # %bb.0: 3215; SSE41-NEXT: movdqa (%rsi), %xmm1 3216; SSE41-NEXT: pextrw $2, %xmm1, %eax 3217; SSE41-NEXT: movd %eax, %xmm0 3218; SSE41-NEXT: movl $11, %eax 3219; SSE41-NEXT: pinsrw $2, %eax, %xmm0 3220; SSE41-NEXT: pextrw $3, %xmm1, %ecx 3221; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 3222; SSE41-NEXT: pinsrw $6, %eax, %xmm0 3223; SSE41-NEXT: pextrw $4, %xmm1, %ecx 3224; SSE41-NEXT: movd %ecx, %xmm2 3225; SSE41-NEXT: pinsrw $2, %eax, %xmm2 3226; SSE41-NEXT: pextrw $5, %xmm1, %ecx 3227; SSE41-NEXT: pinsrw $4, %ecx, %xmm2 3228; SSE41-NEXT: pinsrw $6, %eax, %xmm2 3229; SSE41-NEXT: pextrw $6, %xmm1, %ecx 3230; SSE41-NEXT: movd %ecx, %xmm3 3231; SSE41-NEXT: pinsrw $2, %eax, %xmm3 3232; SSE41-NEXT: pextrw $7, %xmm1, %ecx 3233; SSE41-NEXT: pinsrw $4, %ecx, %xmm3 3234; SSE41-NEXT: pinsrw $6, %eax, %xmm3 3235; SSE41-NEXT: pxor %xmm4, %xmm4 3236; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7] 3237; SSE41-NEXT: pinsrw $2, %eax, %xmm4 3238; SSE41-NEXT: pextrw $1, %xmm1, %ecx 3239; SSE41-NEXT: pinsrw $4, %ecx, %xmm4 3240; SSE41-NEXT: pinsrw $6, %eax, %xmm4 3241; SSE41-NEXT: movdqa %xmm4, (%rdi) 3242; SSE41-NEXT: movdqa %xmm3, 48(%rdi) 3243; SSE41-NEXT: movdqa %xmm2, 32(%rdi) 3244; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 3245; SSE41-NEXT: retq 3246; 3247; AVX1-LABEL: PR45604: 3248; AVX1: # %bb.0: 3249; AVX1-NEXT: vmovdqa (%rsi), %xmm0 3250; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] 3251; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vmovups %ymm1, 32(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR45604:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u>
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0>
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT: vmovdqu %ymm1, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  ; Interleave the loaded halfwords with the constant pattern <11,0> (the
  ; zero lanes of %v2 never survive into %v3's mask, which selects only
  ; elements 0-7 of %v2 plus operand-2 constants).
  %v1 = load <8 x i16>, <8 x i16>* %src, align 16
  %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
  store <32 x i16> %v3, <32 x i16>* %dst, align 16
  ret void
}

; Test case reported on D105827.
; NOTE(review): reduced reproducer — all loads/stores go through undef
; pointers, so only the shuffle/insert/fmul/fadd lowering sequence matters
; here, not the memory traffic. The assertions below are autogenerated; if
; codegen changes, regenerate with utils/update_llc_test_checks.py instead
; of hand-editing.
define void @SpinningCube() {
; SSE2-LABEL: SpinningCube:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; SSE2-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movaps %xmm2, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[1,3]
; SSE2-NEXT: xorps %xmm4, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
; SSE2-NEXT: addps %xmm4, %xmm2
; SSE2-NEXT: movaps %xmm2, (%rax)
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; SSE2-NEXT: mulps %xmm2, %xmm1
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: SpinningCube:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; SSSE3-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movaps %xmm2, %xmm3
; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[1,3]
; SSSE3-NEXT: xorps %xmm4, %xmm4
; SSSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,3]
; SSSE3-NEXT: addps %xmm4, %xmm2
; SSSE3-NEXT: movaps %xmm2, (%rax)
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
; SSSE3-NEXT: mulps %xmm1, %xmm2
; SSSE3-NEXT: addps %xmm0, %xmm2
; SSSE3-NEXT: movaps %xmm2, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: SpinningCube:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
; SSE41-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; SSE41-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT: movaps %xmm1, %xmm3
; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
; SSE41-NEXT: movaps %xmm0, %xmm4
; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
; SSE41-NEXT: addps %xmm3, %xmm4
; SSE41-NEXT: movaps %xmm4, (%rax)
; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
; SSE41-NEXT: mulps %xmm1, %xmm2
; SSE41-NEXT: addps %xmm0, %xmm2
; SSE41-NEXT: movaps %xmm2, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: SpinningCube:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,u,1.0E+0>
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,3]
; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3]
; AVX1-NEXT: vaddps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovaps %xmm2, (%rax)
; AVX1-NEXT: vbroadcastss (%rax), %xmm2
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovaps %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: SpinningCube:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,-2.0E+0,u,u>
; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,3]
; AVX2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[2,3]
; AVX2-NEXT: vaddps %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vmovaps %xmm2, (%rax)
; AVX2-NEXT: vbroadcastss (%rax), %xmm2
; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,3]
; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovaps %xmm0, (%rax)
; AVX2-NEXT: retq
entry:
  store float 1.000000e+00, float* undef, align 4
  %0 = load float, float* undef, align 4
  %1 = fmul float undef, 0.000000e+00
  %2 = insertelement <4 x float> poison, float %0, i32 3
  %3 = load float, float* undef, align 4
  ; Splat the loaded scalar across 2 lanes, scale by <0.0, -2.0>, then widen
  ; the 2-wide product to 4 lanes via a pair of shufflevectors.
  %4 = insertelement <2 x float> poison, float %3, i32 0
  %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer
  %6 = fmul <2 x float> %5, <float 0.000000e+00, float -2.000000e+00>
  %7 = fadd float %1, undef
  %8 = shufflevector <2 x float> %6, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %9 = shufflevector <4 x float> undef, <4 x float> %8, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
  %10 = insertelement <4 x float> %9, float %7, i32 3
  ; Build the second addend mostly from undef/NaN lanes on top of %2.
  %11 = insertelement <4 x float> %2, float 0x7FF8000000000000, i32 1
  %12 = insertelement <4 x float> %11, float undef, i32 0
  %13 = insertelement <4 x float> %12, float undef, i32 2
  %14 = fadd <4 x float> %10, %13
  store <4 x float> %14, <4 x float>* undef, align 16
  ; Second copy of the same splat/scale/widen pattern, added to %2 directly.
  %15 = load float, float* undef, align 4
  %16 = insertelement <2 x float> poison, float %15, i32 0
  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> zeroinitializer
  %18 = fmul <2 x float> %17, <float 0.000000e+00, float -2.000000e+00>
  %19 = shufflevector <2 x float> %18, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %20 = shufflevector <4 x float> undef, <4 x float> %19, <4 x i32> <i32 0, i32 4, i32 5, i32 undef>
  %21 = fadd <4 x float> %20, %2
  store <4 x float> %21, <4 x float>* undef, align 16
  ret void
}