; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-SHUF

; Each function below pairs up vector elements with shufflevector and then adds
; or subtracts them. The autogenerated checks record, for every llc invocation
; above, whether that pattern is expected to come out as a phadd*/phsub*
; instruction or as separate shuffle and add/sub instructions.
;
; Notation used in the comments: x[i] / y[i] is element i of %x / %y. A
; shufflevector mask index that is >= the vector length selects from the second
; shuffle operand.

; Even-indexed elements of %x,%y are added to the odd-indexed ones, i.e. lane i
; holds the sum of one adjacent pair. Every run expects a single (v)phaddw.
define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

; Same adjacent pairs as phaddw1, but the add operands are commuted in every
; other lane (x[1]+x[0], x[2]+x[3], ...) and the second shuffle takes its
; sources in swapped order (%y, %x). Every run still expects a single (v)phaddw.
define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
  %b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, i32 4, i32 7>
  %r = add <8 x i16> %a, %b
  ret <8 x i16> %r
}

; <4 x i32> counterpart of phaddw1: even elements plus odd elements of %x,%y.
; Every run expects a single (v)phaddd.
define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

; <4 x i32> counterpart of phaddw2: commuted add operands in every other lane
; and swapped sources (%y, %x) on the second shuffle. Every run expects a
; single (v)phaddd.
define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Single source: the second shuffle operand is undef, so mask indices 4-7 select
; undef elements and only lane 1 (x[2] + x[3]) is defined. Every run expects
; (v)phaddd with %x as both operands.
define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Single source, lower two lanes defined: x[0] + x[1] and x[2] + x[3]; the upper
; two mask elements are undef. Every run expects (v)phaddd.
define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd4:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Like phaddd4, but the second pair is commuted: x[0] + x[1] and x[3] + x[2].
; Every run expects (v)phaddd.
define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd5:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Only lane 0 (x[0] + x[1]) is defined. The checks expect (v)phaddd only for the
; runs with fast-hops; every other run expects a (v)pshufd + (v)paddd pair.
define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd6:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd6:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd6:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd6:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd6:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Only lane 1 is defined, with commuted operands: x[3] + x[2]. Every run expects
; (v)phaddd.
define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd7:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = add <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Even-indexed elements of %x,%y minus the odd-indexed ones. Every run expects a
; single (v)phsubw.
define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

; <4 x i32> counterpart of phsubw1: even elements minus odd elements. Every run
; expects a single (v)phsubd.
define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Single source with an undef second shuffle operand: mask indices 4-7 select
; undef elements, so only lane 1 (x[2] - x[3]) is defined. Every run expects
; (v)phsubd with %x as both operands.
define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Single source, lower two lanes defined: x[0] - x[1] and x[2] - x[3]. Every run
; expects (v)phsubd.
define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Only lane 0 (x[0] - x[1]) is defined. The checks expect (v)phsubd only for the
; runs with fast-hops; every other run expects a (v)pshufd + (v)psubd pair.
define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phsubd4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phsubd4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phsubd4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phsubd4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phsubd4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Operand order reversed relative to phsubw1: odd-indexed minus even-indexed
; elements. Subtraction does not commute, and the checks expect no (v)phsubw
; here: the operands are built with separate shuffle instructions and combined
; with a plain (v)psubw.
define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm4
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pshufb %xmm3, %xmm2
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT:    pshufb %xmm3, %xmm1
; SSSE3-NEXT:    pshufb %xmm3, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    psubw %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubw1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %r = sub <8 x i16> %a, %b
  ret <8 x i16> %r
}

; <4 x i32> counterpart of phsubw1_reverse: odd-indexed minus even-indexed
; elements. The checks expect two (v)shufps plus a plain (v)psubd and no
; (v)phsubd.
define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1_reverse:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-NEXT:    psubd %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsubd1_reverse:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %r = sub <4 x i32> %a, %b
  ret <4 x i32> %r
}

; Single source with the pair sums placed in the upper two lanes: lane 2 is
; x[0] + x[1] and lane 3 is x[2] + x[3]; lanes 0-1 are undef. Every run expects
; (v)phaddd.
define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

; Same add as phaddd_single_source1, followed by a shuffle that moves the two
; sums into lanes 0-1 in swapped order (lane 0 = x[2] + x[3], lane 1 =
; x[0] + x[1]). The checks expect (v)phaddd + (v)pshufd for the runs with
; fast-hops and two (v)pshufd + (v)paddd for every other run.
define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source2:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source2:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source2:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

; Only lane 2 (x[0] + x[1]) is defined. Every run expects (v)phaddd.
define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}

; One shuffle only: x[2] is moved to lane 3 and added to the unshuffled %x, so
; lane 3 is x[2] + x[3]. The checks expect (v)phaddd for the runs with fast-hops
; and (v)pshufd + (v)paddd for every other run.
define <4 x i32> @phaddd_single_source4(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  ret <4 x i32> %add
}

; Same add as phaddd_single_source4, followed by a shuffle that moves lane 3
; (x[2] + x[3]) down to lane 0. The checks expect (v)phaddd + (v)pshufd for the
; runs with fast-hops and two (v)pshufd + (v)paddd for every other run.
define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %add = add <4 x i32> %l, %x
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

; Same add as phaddd_single_source3 (lane 2 = x[0] + x[1]), followed by a
; shuffle that moves that sum to lane 1. Every run expects (v)phaddd followed
; by a (v)pshufd.
define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddd_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT:    retq
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  ret <4 x i32> %shuffle2
}

; <8 x i16> single source with the four pair sums placed in the upper four
; lanes (lanes 4-7); lanes 0-3 are undef. Every run expects (v)phaddw.
define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

; Same add as phaddw_single_source1, followed by a shuffle that reads lanes
; 5,4,3,2 of the sum into lanes 0-3 (lanes 3 and 2 of the sum are undef). The
; checks expect (v)phaddw + (v)pshuflw for the runs with fast-hops and two
; (v)pshuflw + (v)paddw for every other run.
define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source2:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddw_single_source2:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SHUF-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SHUF-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

; Only lanes 4 and 5 are defined: x[0] + x[1] and x[2] + x[3]. Every run expects
; (v)phaddw.
define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source3:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}

; One shuffle only: x[6] is moved to lane 7 and added to the unshuffled %x, so
; lane 7 is x[6] + x[7]. The checks expect (v)phaddw for the runs with fast-hops
; and a (v)pslld $16 + (v)paddw pair for every other run.
define <8 x i16> @phaddw_single_source4(<8 x i16> %x) {
; SSSE3-SLOW-LABEL: phaddw_single_source4:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pslld $16, %xmm1
; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: phaddw_single_source4:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: phaddw_single_source4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX-SLOW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: phaddw_single_source4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: phaddw_single_source4:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vpslld $16, %xmm0, %xmm1
; AVX2-SHUF-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6>
  %add = add <8 x i16> %l, %x
  ret <8 x i16> %add
}

; Lane 4 of the sum is x[0] + x[1]; the trailing shuffle moves it to lane 1.
; Every run expects (v)phaddw followed by a (v)psrldq.
define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
; SSSE3-LABEL: phaddw_single_source6:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phaddw_single_source6:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT:    retq
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
  %add = add <8 x i16> %l, %r
  %shuffle2 = shufflevector <8 x i16> %add, <8 x i16> undef, <8 x i32> <i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i16> %shuffle2
}

; PR39921 + PR39936
; Three shuffle+add stages reduce the eight i32 elements of the argument to
; their total in lane 0, which is then extracted and returned. The checks
; expect all three stages as (v)phaddd for the runs with fast-hops; every other
; run expects two (v)phaddd followed by (v)pshufd + (v)paddd for the last stage.
define i32 @PR39936_v8i32(<8 x i32>) {
; SSSE3-SLOW-LABEL: PR39936_v8i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR39936_v8i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR39936_v8i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR39936_v8i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-SLOW-LABEL: PR39936_v8i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: PR39936_v8i32:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
; AVX2-FAST-NEXT:    vzeroupper
; AVX2-FAST-NEXT:    retq
;
; AVX2-SHUF-LABEL: PR39936_v8i32:
; AVX2-SHUF:       # %bb.0:
; AVX2-SHUF-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SHUF-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX2-SHUF-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-SHUF-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SHUF-NEXT:    vmovd %xmm0, %eax
; AVX2-SHUF-NEXT:    vzeroupper
; AVX2-SHUF-NEXT:    retq
  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = add <8 x i32> %2, %3
  %5 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> %4, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = add <8 x i32> %5, %6
  %8 = shufflevector <8 x i32> %7, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %9 = add <8 x i32> %8, %7
  %10 = extractelement <8 x i32> %9, i32 0
  ret i32 %10
}