; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512-FAST

; NOTE(review): these tests check formation of x86 horizontal FP add/sub
; instructions (haddp*, hsubp* and their v-prefixed AVX forms) from
; shufflevector + fadd/fsub patterns.  The -SLOW / -FAST suffixes select
; codegen without and with the fast-hops subtarget attribute.

; fadd of the even lanes (x0,y0) with the odd lanes (x1,y1) is exactly
; the haddpd lane semantics, so this folds to a single haddpd.
define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddpd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddpd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

; Same pairs as haddpd1 but expressed through differently-shaped shuffles
; (second shuffle has its operands swapped); still folds to one haddpd.
define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddpd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddpd2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

; Single-source pair sum (x0+x1) with undef high lanes: only formed as a
; horizontal op when fast-hops is set; otherwise shuffle + addpd.
define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-SLOW-LABEL: haddpd3:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addpd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: haddpd3:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: haddpd3:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: haddpd3:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

; Canonical even/odd lane split over two sources: one haddps.
define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

; Equivalent pairs reached via swapped-operand shuffles; still one haddps.
define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

; Single source with an undef lane 0; haddps formed only under fast-hops.
define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps3:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: haddps3:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: haddps3:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: haddps3:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

; Single source, low-half pairs only (high lanes undef); fast-hops uses haddps.
define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps4:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: haddps4:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: haddps4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: haddps4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

; Pairs in mixed order (x0+x1, x3+x2): fast-hops forms haddps, otherwise
; two shuffles + addps.
define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps5:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,3]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,2,2,3]
; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: haddps5:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: haddps5:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,3]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,2,2,3]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: haddps5:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

; Only lane 0 of the result is defined (x0+x1); the slow path can use a
; single movshdup + addps, fast-hops still prefers haddps.
define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps6:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: haddps6:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: haddps6:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: haddps6:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

; Only lane 1 of the result is defined (x3+x2); fast-hops forms haddps.
define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps7:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE3-SLOW-NEXT:    addps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: haddps7:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: haddps7:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: haddps7:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

; fsub counterpart of haddpd1: even minus odd lanes folds to hsubpd.
define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubpd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hsubpd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

; Single-source x0-x1 with undef high lane; hsubpd only under fast-hops.
define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-SLOW-LABEL: hsubpd2:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    subpd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: hsubpd2:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: hsubpd2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: hsubpd2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fsub <2 x double> %x0a, <2 x double> %b
  ret <2 x double> %r
}

; fsub counterpart of haddps1: even minus odd lanes folds to hsubps.
define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: hsubps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

; Single source with an undef lane 0; hsubps formed only under fast-hops.
define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps2:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE3-SLOW-NEXT:    subps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: hsubps2:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: hsubps2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: hsubps2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

; Single source, low-half differences only; hsubps formed under fast-hops.
define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps3:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE3-SLOW-NEXT:    subps %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: hsubps3:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: hsubps3:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-SLOW-NEXT:    vsubps %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: hsubps3:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

; Only lane 0 defined (x0-x1); movshdup + subps unless fast-hops.
define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps4:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    subps %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: hsubps4:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: hsubps4:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: hsubps4:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

; 256-bit even/odd pair sum: one vhaddps ymm on AVX, two haddps on SSE3.
define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm2, %xmm0
; SSE3-NEXT:    haddps %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhaddps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

; Same pairs as vhaddps1 via swapped-operand shuffles; same codegen.
define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm2, %xmm0
; SSE3-NEXT:    haddps %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhaddps2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

; 256-bit single-source pair sum with undef lanes; horizontal ops need fast-hops.
define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-SLOW-LABEL: vhaddps3:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm3
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE3-SLOW-NEXT:    addps %xmm2, %xmm1
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE3-SLOW-NEXT:    addps %xmm3, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: vhaddps3:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    haddps %xmm1, %xmm1
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: vhaddps3:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX-SLOW-NEXT:    vaddps %ymm0, %ymm1, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: vhaddps3:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

; 256-bit even minus odd pairs: one vhsubps ymm, or two hsubps on SSE3.
define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubps %xmm2, %xmm0
; SSE3-NEXT:    hsubps %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhsubps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

; fsub counterpart of vhaddps3; horizontal ops need fast-hops.
define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-SLOW-LABEL: vhsubps3:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movaps %xmm1, %xmm2
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
; SSE3-SLOW-NEXT:    movaps %xmm0, %xmm3
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE3-SLOW-NEXT:    subps %xmm1, %xmm2
; SSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE3-SLOW-NEXT:    subps %xmm0, %xmm3
; SSE3-SLOW-NEXT:    movaps %xmm3, %xmm0
; SSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: vhsubps3:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    hsubps %xmm1, %xmm1
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: vhsubps3:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX-SLOW-NEXT:    vsubps %ymm0, %ymm1, %ymm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: vhsubps3:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
; AVX-FAST-NEXT:    retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

; 256-bit double pairs: one vhaddpd ymm, or two haddpd on SSE3.
define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddpd %xmm2, %xmm0
; SSE3-NEXT:    haddpd %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhaddpd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fadd <4 x double> %a, %b
  ret <4 x double> %r
}

; fsub counterpart of vhaddpd1.
define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    hsubpd %xmm2, %xmm0
; SSE3-NEXT:    hsubpd %xmm3, %xmm1
; SSE3-NEXT:    retq
;
; AVX-LABEL: vhsubpd1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fsub <4 x double> %a, %b
  ret <4 x double> %r
}

; The same pattern written with scalar extract/insert instead of shuffles
; also becomes a single haddps (narrow <2 x float> result).
define <2 x float> @haddps_v2f32(<4 x float> %v0) {
; SSE3-LABEL: haddps_v2f32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    haddps %xmm0, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: haddps_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %v0.0 = extractelement <4 x float> %v0, i32 0
  %v0.1 = extractelement <4 x float> %v0, i32 1
  %v0.2 = extractelement <4 x float> %v0, i32 2
  %v0.3 = extractelement <4 x float> %v0, i32 3
  %op0 = fadd float %v0.0, %v0.1
  %op1 = fadd float %v0.2, %v0.3
  %res0 = insertelement <2 x float> undef, float %op0, i32 0
  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
  ret <2 x float> %res1
}

; 128-bit vectors, float/double, fadd/fsub

; Scalar x0+x1 of a v4f32: fast-hops uses haddps, slow uses shuffle + addss.
define float @extract_extract_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

; Commuted fadd (x1 + x0) must be recognized identically - fadd commutes.
define float @extract_extract_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

; Scalar x0+x1 of a v2f64: haddpd under fast-hops, shuffle + addsd otherwise.
define double @extract_extract_v2f64_fadd_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v2f64_fadd_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v2f64_fadd_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v2f64_fadd_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v2f64_fadd_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

; Commuted double fadd (x1 + x0) - same codegen as the non-commuted form.
define double @extract_extract_v2f64_fadd_f64_commute(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v2f64_fadd_f64_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v2f64_fadd_f64_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v2f64_fadd_f64_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v2f64_fadd_f64_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

; Scalar x0-x1: hsubps under fast-hops, shuffle + subss otherwise.
define float @extract_extract_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fsub_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fsub_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fsub_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fsub_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

; x1 - x0 does not match the hsub lane order, so no horizontal op
; (checks are identical with and without fast-hops).
define float @extract_extract_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract_v4f32_fsub_f32_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT:    subss %xmm0, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract_v4f32_fsub_f32_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

; Scalar x0-x1 of a v2f64: hsubpd under fast-hops.
define double @extract_extract_v2f64_fsub_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v2f64_fsub_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v2f64_fsub_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v2f64_fsub_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v2f64_fsub_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

; x1 - x0 (double): wrong lane order for hsubpd, so no horizontal op.
define double @extract_extract_v2f64_fsub_f64_commute(<2 x double> %x) {
; SSE3-LABEL: extract_extract_v2f64_fsub_f64_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movapd %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    subsd %xmm0, %xmm1
; SSE3-NEXT:    movapd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract_v2f64_fsub_f64_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 256-bit vectors, float/double, fadd/fsub

; As above but from a ymm source: the op narrows to xmm and AVX adds vzeroupper.
define float @extract_extract_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f32_fadd_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v8f32_fadd_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v8f32_fadd_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v8f32_fadd_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

; Commuted ymm-source fadd; same narrowing behavior.
define float @extract_extract_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f32_fadd_f32_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v8f32_fadd_f32_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v8f32_fadd_f32_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v8f32_fadd_f32_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

; v4f64 source, scalar x0+x1; narrows to xmm haddpd under fast-hops.
define double @extract_extract_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f64_fadd_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v4f64_fadd_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v4f64_fadd_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v4f64_fadd_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

; Commuted v4f64 fadd; same narrowing behavior.
define double @extract_extract_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f64_fadd_f64_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v4f64_fadd_f64_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v4f64_fadd_f64_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v4f64_fadd_f64_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

; v8f32 source, scalar x0-x1; hsubps narrowed to xmm under fast-hops.
define float @extract_extract_v8f32_fsub_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f32_fsub_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v8f32_fsub_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v8f32_fsub_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v8f32_fsub_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

; Negative test...or get hoppy and negate?
; (x1 - x0 does not match the hsub lane order, so no horizontal op today.)

define float @extract_extract_v8f32_fsub_f32_commute(<8 x float> %x) {
; SSE3-LABEL: extract_extract_v8f32_fsub_f32_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT:    subss %xmm0, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract_v8f32_fsub_f32_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

; v4f64 source, scalar x0-x1; hsubpd narrowed to xmm under fast-hops.
define double @extract_extract_v4f64_fsub_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v4f64_fsub_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v4f64_fsub_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v4f64_fsub_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v4f64_fsub_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

; Negative test...or get hoppy and negate?
; (x1 - x0 does not match the hsub lane order, so no horizontal op today.)

define double @extract_extract_v4f64_fsub_f64_commute(<4 x double> %x) {
; SSE3-LABEL: extract_extract_v4f64_fsub_f64_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movapd %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    subsd %xmm0, %xmm1
; SSE3-NEXT:    movapd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract_v4f64_fsub_f64_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 512-bit vectors, float/double, fadd/fsub

define float @extract_extract_v16f32_fadd_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v16f32_fadd_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v16f32_fadd_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v16f32_fadd_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v16f32_fadd_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract_v16f32_fadd_f32_commute(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v16f32_fadd_f32_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v16f32_fadd_f32_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v16f32_fadd_f32_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v16f32_fadd_f32_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract_v8f64_fadd_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f64_fadd_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v8f64_fadd_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v8f64_fadd_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v8f64_fadd_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract_v8f64_fadd_f64_commute(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f64_fadd_f64_commute:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v8f64_fadd_f64_commute:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v8f64_fadd_f64_commute:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v8f64_fadd_f64_commute:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract_v16f32_fsub_f32(<16 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract_v16f32_fsub_f32:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    subss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v16f32_fsub_f32:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v16f32_fsub_f32:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v16f32_fsub_f32:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract_v16f32_fsub_f32_commute(<16 x float> %x) {
; SSE3-LABEL: extract_extract_v16f32_fsub_f32_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT:    subss %xmm0, %xmm1
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract_v16f32_fsub_f32_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x0 = extractelement <16 x float> %x, i32 0
  %x1 = extractelement <16 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract_v8f64_fsub_f64(<8 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract_v8f64_fsub_f64:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT:    subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v8f64_fsub_f64:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v8f64_fsub_f64:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v8f64_fsub_f64:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract_v8f64_fsub_f64_commute(<8 x double> %x) {
; SSE3-LABEL: extract_extract_v8f64_fsub_f64_commute:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movapd %xmm0, %xmm1
; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT:    subsd %xmm0, %xmm1
; SSE3-NEXT:    movapd %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract_v8f64_fsub_f64_commute:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %x0 = extractelement <8 x double> %x, i32 0
  %x1 = extractelement <8 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; Check output when 1 or both extracts have extra uses.

define float @extract_extract_v4f32_fadd_f32_uses1(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses1:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movss %xmm0, (%rdi)
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses1:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movss %xmm0, (%rdi)
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses1:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovss %xmm0, (%rdi)
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses1:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vmovss %xmm0, (%rdi)
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract_v4f32_fadd_f32_uses2(<4 x float> %x, float* %p) {
; SSE3-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses2:
; SSE3-SLOW:       # %bb.0:
; SSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT:    movss %xmm1, (%rdi)
; SSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSE3-SLOW-NEXT:    retq
;
; SSE3-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses2:
; SSE3-FAST:       # %bb.0:
; SSE3-FAST-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT:    movss %xmm1, (%rdi)
; SSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSE3-FAST-NEXT:    retq
;
; AVX-SLOW-LABEL: extract_extract_v4f32_fadd_f32_uses2:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT:    vmovss %xmm1, (%rdi)
; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: extract_extract_v4f32_fadd_f32_uses2:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vextractps $1, %xmm0, (%rdi)
; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, float* %p2) {
; SSE3-LABEL: extract_extract_v4f32_fadd_f32_uses3:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movss %xmm0, (%rdi)
; SSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT:    movss %xmm1, (%rsi)
; SSE3-NEXT:    addss %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; AVX-LABEL: extract_extract_v4f32_fadd_f32_uses3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss %xmm0, (%rdi)
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovss %xmm1, (%rsi)
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 0
  store float %x0, float* %p1
  %x1 = extractelement <4 x float> %x, i32 1
  store float %x1, float* %p2
  %x01 = fadd float %x0, %x1
  ret float %x01
}
