; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST

define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
  %b = shufflevector <2 x double> %y, <2 x double> %x, <2 x i32> <i32 2, i32 1>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-SLOW-LABEL: haddpd3:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddpd3:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddpd3:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddpd3:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fadd <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
  %b = shufflevector <4 x float> %y, <4 x float> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-LABEL: haddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-LABEL: haddps4:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps4:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-LABEL: haddps5:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps5:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-SLOW-LABEL: haddps6:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: haddps6:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: haddps6:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: haddps6:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-LABEL: haddps7:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps7:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
  %r = fadd <4 x float> %a, %b
  ret <4 x float> %r
}

define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
  %b = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 3>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-SLOW-LABEL: hsubpd2:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubpd2:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubpd2:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubpd2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %r = fsub <2 x double> %a, %b
  ret <2 x double> %r
}

define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-LABEL: hsubps2:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps2:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-LABEL: hsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-SLOW-LABEL: hsubps4:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: hsubps4:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: hsubps4:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: hsubps4:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %r = fsub <4 x float> %a, %b
  ret <4 x float> %r
}

define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
  %b = shufflevector <8 x float> %y, <8 x float> %x, <8 x i32> <i32 8, i32 11, i32 0, i32 3, i32 12, i32 15, i32 4, i32 7>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-LABEL: vhaddps3:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps3:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fadd <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm2, %xmm0
; SSE3-NEXT: hsubps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-LABEL: vhsubps3:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: hsubps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps3:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
  %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 9, i32 undef, i32 5, i32 7, i32 13, i32 15>
  %r = fsub <8 x float> %a, %b
  ret <8 x float> %r
}

define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm2, %xmm0
; SSE3-NEXT: haddpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fadd <4 x double> %a, %b
  ret <4 x double> %r
}

define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm2, %xmm0
; SSE3-NEXT: hsubpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubpd1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %b = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %r = fsub <4 x double> %a, %b
  ret <4 x double> %r
}

define <2 x float> @haddps_v2f32(<4 x float> %v0) {
; SSE3-LABEL: haddps_v2f32:
; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps_v2f32:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %v0.0 = extractelement <4 x float> %v0, i32 0
  %v0.1 = extractelement <4 x float> %v0, i32 1
  %v0.2 = extractelement <4 x float> %v0, i32 2
  %v0.3 = extractelement <4 x float> %v0, i32 3
  %op0 = fadd float %v0.0, %v0.1
  %op1 = fadd float %v0.2, %v0.3
  %res0 = insertelement <2 x float> undef, float %op0, i32 0
  %res1 = insertelement <2 x float> %res0, float %op1, i32 1
  ret <2 x float> %res1
}

; 128-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fadd double %x1, %x0
  ret double %x01
}

define float @extract_extract01_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-NEXT: subss %xmm0, %xmm1
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 0
  %x1 = extractelement <4 x float> %x, i32 1
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) {
; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT: subss %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <4 x float> %x, i32 2
  %x1 = extractelement <4 x float> %x, i32 3
  %x01 = fsub float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x0, %x1
  ret double %x01
}

define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) {
; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movapd %xmm0, %xmm1
; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-NEXT: subsd %xmm0, %xmm1
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %x0 = extractelement <2 x double> %x, i32 0
  %x1 = extractelement <2 x double> %x, i32 1
  %x01 = fsub double %x1, %x0
  ret double %x01
}

; 256-bit vectors, float/double, fadd/fsub

define float @extract_extract01_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x0, %x1
  ret float %x01
}

define float @extract_extract01_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 0
  %x1 = extractelement <8 x float> %x, i32 1
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 2
  %x1 = extractelement <8 x float> %x, i32 3
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x float> %x, i32 6
  %x1 = extractelement <8 x float> %x, i32 7
  %x01 = fadd float %x1, %x0
  ret float %x01
}

define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x double> %x, i32 0
  %x1 = extractelement <4 x double> %x, i32 1
  %x01 = fadd double %x0, %x1
  ret double %x01
}

define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: movapd %xmm1, %xmm0
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
;
AVX-FAST-NEXT: vzeroupper 1058; AVX-FAST-NEXT: retq 1059 %x0 = extractelement <4 x double> %x, i32 2 1060 %x1 = extractelement <4 x double> %x, i32 3 1061 %x01 = fadd double %x0, %x1 1062 ret double %x01 1063} 1064 1065define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) { 1066; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute: 1067; SSE3-SLOW: # %bb.0: 1068; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 1069; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1070; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 1071; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 1072; SSE3-SLOW-NEXT: retq 1073; 1074; SSE3-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute: 1075; SSE3-FAST: # %bb.0: 1076; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0 1077; SSE3-FAST-NEXT: retq 1078; 1079; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute: 1080; AVX-SLOW: # %bb.0: 1081; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1082; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1083; AVX-SLOW-NEXT: vzeroupper 1084; AVX-SLOW-NEXT: retq 1085; 1086; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute: 1087; AVX-FAST: # %bb.0: 1088; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 1089; AVX-FAST-NEXT: vzeroupper 1090; AVX-FAST-NEXT: retq 1091 %x0 = extractelement <4 x double> %x, i32 0 1092 %x1 = extractelement <4 x double> %x, i32 1 1093 %x01 = fadd double %x1, %x0 1094 ret double %x01 1095} 1096 1097define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) { 1098; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute: 1099; SSE3-SLOW: # %bb.0: 1100; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 1101; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1102; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 1103; SSE3-SLOW-NEXT: retq 1104; 1105; SSE3-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute: 1106; SSE3-FAST: # %bb.0: 1107; SSE3-FAST-NEXT: movapd %xmm1, %xmm0 1108; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0 1109; SSE3-FAST-NEXT: retq 1110; 1111; AVX-SLOW-LABEL: 
extract_extract23_v4f64_fadd_f64_commute: 1112; AVX-SLOW: # %bb.0: 1113; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 1114; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1115; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1116; AVX-SLOW-NEXT: vzeroupper 1117; AVX-SLOW-NEXT: retq 1118; 1119; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute: 1120; AVX-FAST: # %bb.0: 1121; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 1122; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 1123; AVX-FAST-NEXT: vzeroupper 1124; AVX-FAST-NEXT: retq 1125 %x0 = extractelement <4 x double> %x, i32 2 1126 %x1 = extractelement <4 x double> %x, i32 3 1127 %x01 = fadd double %x1, %x0 1128 ret double %x01 1129} 1130 1131define float @extract_extract01_v8f32_fsub_f32(<8 x float> %x) { 1132; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32: 1133; SSE3-SLOW: # %bb.0: 1134; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1135; SSE3-SLOW-NEXT: subss %xmm1, %xmm0 1136; SSE3-SLOW-NEXT: retq 1137; 1138; SSE3-FAST-LABEL: extract_extract01_v8f32_fsub_f32: 1139; SSE3-FAST: # %bb.0: 1140; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 1141; SSE3-FAST-NEXT: retq 1142; 1143; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32: 1144; AVX-SLOW: # %bb.0: 1145; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1146; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0 1147; AVX-SLOW-NEXT: vzeroupper 1148; AVX-SLOW-NEXT: retq 1149; 1150; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32: 1151; AVX-FAST: # %bb.0: 1152; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 1153; AVX-FAST-NEXT: vzeroupper 1154; AVX-FAST-NEXT: retq 1155 %x0 = extractelement <8 x float> %x, i32 0 1156 %x1 = extractelement <8 x float> %x, i32 1 1157 %x01 = fsub float %x0, %x1 1158 ret float %x01 1159} 1160 1161define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) { 1162; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32: 1163; SSE3-SLOW: # %bb.0: 1164; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 1165; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 
1166; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1167; SSE3-SLOW-NEXT: subss %xmm0, %xmm1 1168; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 1169; SSE3-SLOW-NEXT: retq 1170; 1171; SSE3-FAST-LABEL: extract_extract23_v8f32_fsub_f32: 1172; SSE3-FAST: # %bb.0: 1173; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 1174; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 1175; SSE3-FAST-NEXT: retq 1176; 1177; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32: 1178; AVX-SLOW: # %bb.0: 1179; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1180; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 1181; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0 1182; AVX-SLOW-NEXT: vzeroupper 1183; AVX-SLOW-NEXT: retq 1184; 1185; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32: 1186; AVX-FAST: # %bb.0: 1187; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 1188; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 1189; AVX-FAST-NEXT: vzeroupper 1190; AVX-FAST-NEXT: retq 1191 %x0 = extractelement <8 x float> %x, i32 2 1192 %x1 = extractelement <8 x float> %x, i32 3 1193 %x01 = fsub float %x0, %x1 1194 ret float %x01 1195} 1196 1197define float @extract_extract45_v8f32_fsub_f32(<8 x float> %x) { 1198; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32: 1199; SSE3-SLOW: # %bb.0: 1200; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 1201; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] 1202; SSE3-SLOW-NEXT: subss %xmm1, %xmm0 1203; SSE3-SLOW-NEXT: retq 1204; 1205; SSE3-FAST-LABEL: extract_extract45_v8f32_fsub_f32: 1206; SSE3-FAST: # %bb.0: 1207; SSE3-FAST-NEXT: movaps %xmm1, %xmm0 1208; SSE3-FAST-NEXT: hsubps %xmm1, %xmm0 1209; SSE3-FAST-NEXT: retq 1210; 1211; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32: 1212; AVX-SLOW: # %bb.0: 1213; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 1214; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1215; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0 1216; AVX-SLOW-NEXT: vzeroupper 1217; AVX-SLOW-NEXT: retq 1218; 1219; AVX-FAST-LABEL: 
extract_extract45_v8f32_fsub_f32: 1220; AVX-FAST: # %bb.0: 1221; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 1222; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 1223; AVX-FAST-NEXT: vzeroupper 1224; AVX-FAST-NEXT: retq 1225 %x0 = extractelement <8 x float> %x, i32 4 1226 %x1 = extractelement <8 x float> %x, i32 5 1227 %x01 = fsub float %x0, %x1 1228 ret float %x01 1229} 1230 1231; Negative test...or get hoppy and negate? 1232 1233define float @extract_extract01_v8f32_fsub_f32_commute(<8 x float> %x) { 1234; SSE3-LABEL: extract_extract01_v8f32_fsub_f32_commute: 1235; SSE3: # %bb.0: 1236; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1237; SSE3-NEXT: subss %xmm0, %xmm1 1238; SSE3-NEXT: movaps %xmm1, %xmm0 1239; SSE3-NEXT: retq 1240; 1241; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute: 1242; AVX: # %bb.0: 1243; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1244; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 1245; AVX-NEXT: vzeroupper 1246; AVX-NEXT: retq 1247 %x0 = extractelement <8 x float> %x, i32 0 1248 %x1 = extractelement <8 x float> %x, i32 1 1249 %x01 = fsub float %x1, %x0 1250 ret float %x01 1251} 1252 1253define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) { 1254; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64: 1255; SSE3-SLOW: # %bb.0: 1256; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 1257; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1258; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0 1259; SSE3-SLOW-NEXT: retq 1260; 1261; SSE3-FAST-LABEL: extract_extract01_v4f64_fsub_f64: 1262; SSE3-FAST: # %bb.0: 1263; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0 1264; SSE3-FAST-NEXT: retq 1265; 1266; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64: 1267; AVX-SLOW: # %bb.0: 1268; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1269; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 1270; AVX-SLOW-NEXT: vzeroupper 1271; AVX-SLOW-NEXT: retq 1272; 1273; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64: 1274; AVX-FAST: # %bb.0: 1275; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 
1276; AVX-FAST-NEXT: vzeroupper 1277; AVX-FAST-NEXT: retq 1278 %x0 = extractelement <4 x double> %x, i32 0 1279 %x1 = extractelement <4 x double> %x, i32 1 1280 %x01 = fsub double %x0, %x1 1281 ret double %x01 1282} 1283 1284; Negative test...or get hoppy and negate? 1285 1286define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) { 1287; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute: 1288; SSE3: # %bb.0: 1289; SSE3-NEXT: movapd %xmm0, %xmm1 1290; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1291; SSE3-NEXT: subsd %xmm0, %xmm1 1292; SSE3-NEXT: movapd %xmm1, %xmm0 1293; SSE3-NEXT: retq 1294; 1295; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute: 1296; AVX: # %bb.0: 1297; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1298; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 1299; AVX-NEXT: vzeroupper 1300; AVX-NEXT: retq 1301 %x0 = extractelement <4 x double> %x, i32 0 1302 %x1 = extractelement <4 x double> %x, i32 1 1303 %x01 = fsub double %x1, %x0 1304 ret double %x01 1305} 1306 1307; 512-bit vectors, float/double, fadd/fsub 1308 1309define float @extract_extract01_v16f32_fadd_f32(<16 x float> %x) { 1310; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32: 1311; SSE3-SLOW: # %bb.0: 1312; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1313; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 1314; SSE3-SLOW-NEXT: retq 1315; 1316; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32: 1317; SSE3-FAST: # %bb.0: 1318; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 1319; SSE3-FAST-NEXT: retq 1320; 1321; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32: 1322; AVX-SLOW: # %bb.0: 1323; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1324; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 1325; AVX-SLOW-NEXT: vzeroupper 1326; AVX-SLOW-NEXT: retq 1327; 1328; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32: 1329; AVX-FAST: # %bb.0: 1330; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1331; AVX-FAST-NEXT: vzeroupper 1332; AVX-FAST-NEXT: retq 1333 %x0 = extractelement <16 x float> 
%x, i32 0 1334 %x1 = extractelement <16 x float> %x, i32 1 1335 %x01 = fadd float %x0, %x1 1336 ret float %x01 1337} 1338 1339define float @extract_extract01_v16f32_fadd_f32_commute(<16 x float> %x) { 1340; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute: 1341; SSE3-SLOW: # %bb.0: 1342; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1343; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 1344; SSE3-SLOW-NEXT: retq 1345; 1346; SSE3-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute: 1347; SSE3-FAST: # %bb.0: 1348; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 1349; SSE3-FAST-NEXT: retq 1350; 1351; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute: 1352; AVX-SLOW: # %bb.0: 1353; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1354; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 1355; AVX-SLOW-NEXT: vzeroupper 1356; AVX-SLOW-NEXT: retq 1357; 1358; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute: 1359; AVX-FAST: # %bb.0: 1360; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1361; AVX-FAST-NEXT: vzeroupper 1362; AVX-FAST-NEXT: retq 1363 %x0 = extractelement <16 x float> %x, i32 0 1364 %x1 = extractelement <16 x float> %x, i32 1 1365 %x01 = fadd float %x1, %x0 1366 ret float %x01 1367} 1368 1369define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) { 1370; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64: 1371; SSE3-SLOW: # %bb.0: 1372; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 1373; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1374; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 1375; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 1376; SSE3-SLOW-NEXT: retq 1377; 1378; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64: 1379; SSE3-FAST: # %bb.0: 1380; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0 1381; SSE3-FAST-NEXT: retq 1382; 1383; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64: 1384; AVX-SLOW: # %bb.0: 1385; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1386; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1387; AVX-SLOW-NEXT: vzeroupper 1388; AVX-SLOW-NEXT: retq 1389; 
1390; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64: 1391; AVX-FAST: # %bb.0: 1392; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 1393; AVX-FAST-NEXT: vzeroupper 1394; AVX-FAST-NEXT: retq 1395 %x0 = extractelement <8 x double> %x, i32 0 1396 %x1 = extractelement <8 x double> %x, i32 1 1397 %x01 = fadd double %x0, %x1 1398 ret double %x01 1399} 1400 1401define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) { 1402; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute: 1403; SSE3-SLOW: # %bb.0: 1404; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 1405; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1406; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1 1407; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 1408; SSE3-SLOW-NEXT: retq 1409; 1410; SSE3-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute: 1411; SSE3-FAST: # %bb.0: 1412; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0 1413; SSE3-FAST-NEXT: retq 1414; 1415; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute: 1416; AVX-SLOW: # %bb.0: 1417; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1418; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0 1419; AVX-SLOW-NEXT: vzeroupper 1420; AVX-SLOW-NEXT: retq 1421; 1422; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute: 1423; AVX-FAST: # %bb.0: 1424; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 1425; AVX-FAST-NEXT: vzeroupper 1426; AVX-FAST-NEXT: retq 1427 %x0 = extractelement <8 x double> %x, i32 0 1428 %x1 = extractelement <8 x double> %x, i32 1 1429 %x01 = fadd double %x1, %x0 1430 ret double %x01 1431} 1432 1433define float @extract_extract01_v16f32_fsub_f32(<16 x float> %x) { 1434; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32: 1435; SSE3-SLOW: # %bb.0: 1436; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1437; SSE3-SLOW-NEXT: subss %xmm1, %xmm0 1438; SSE3-SLOW-NEXT: retq 1439; 1440; SSE3-FAST-LABEL: extract_extract01_v16f32_fsub_f32: 1441; SSE3-FAST: # %bb.0: 1442; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 1443; SSE3-FAST-NEXT: retq 1444; 1445; AVX-SLOW-LABEL: 
extract_extract01_v16f32_fsub_f32: 1446; AVX-SLOW: # %bb.0: 1447; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1448; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0 1449; AVX-SLOW-NEXT: vzeroupper 1450; AVX-SLOW-NEXT: retq 1451; 1452; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32: 1453; AVX-FAST: # %bb.0: 1454; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 1455; AVX-FAST-NEXT: vzeroupper 1456; AVX-FAST-NEXT: retq 1457 %x0 = extractelement <16 x float> %x, i32 0 1458 %x1 = extractelement <16 x float> %x, i32 1 1459 %x01 = fsub float %x0, %x1 1460 ret float %x01 1461} 1462 1463define float @extract_extract01_v16f32_fsub_f32_commute(<16 x float> %x) { 1464; SSE3-LABEL: extract_extract01_v16f32_fsub_f32_commute: 1465; SSE3: # %bb.0: 1466; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1467; SSE3-NEXT: subss %xmm0, %xmm1 1468; SSE3-NEXT: movaps %xmm1, %xmm0 1469; SSE3-NEXT: retq 1470; 1471; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute: 1472; AVX: # %bb.0: 1473; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1474; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 1475; AVX-NEXT: vzeroupper 1476; AVX-NEXT: retq 1477 %x0 = extractelement <16 x float> %x, i32 0 1478 %x1 = extractelement <16 x float> %x, i32 1 1479 %x01 = fsub float %x1, %x0 1480 ret float %x01 1481} 1482 1483define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) { 1484; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64: 1485; SSE3-SLOW: # %bb.0: 1486; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 1487; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1488; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0 1489; SSE3-SLOW-NEXT: retq 1490; 1491; SSE3-FAST-LABEL: extract_extract01_v8f64_fsub_f64: 1492; SSE3-FAST: # %bb.0: 1493; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0 1494; SSE3-FAST-NEXT: retq 1495; 1496; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64: 1497; AVX-SLOW: # %bb.0: 1498; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1499; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0 1500; AVX-SLOW-NEXT: vzeroupper 
1501; AVX-SLOW-NEXT: retq 1502; 1503; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64: 1504; AVX-FAST: # %bb.0: 1505; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 1506; AVX-FAST-NEXT: vzeroupper 1507; AVX-FAST-NEXT: retq 1508 %x0 = extractelement <8 x double> %x, i32 0 1509 %x1 = extractelement <8 x double> %x, i32 1 1510 %x01 = fsub double %x0, %x1 1511 ret double %x01 1512} 1513 1514define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) { 1515; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute: 1516; SSE3: # %bb.0: 1517; SSE3-NEXT: movapd %xmm0, %xmm1 1518; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1519; SSE3-NEXT: subsd %xmm0, %xmm1 1520; SSE3-NEXT: movapd %xmm1, %xmm0 1521; SSE3-NEXT: retq 1522; 1523; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute: 1524; AVX: # %bb.0: 1525; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1526; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 1527; AVX-NEXT: vzeroupper 1528; AVX-NEXT: retq 1529 %x0 = extractelement <8 x double> %x, i32 0 1530 %x1 = extractelement <8 x double> %x, i32 1 1531 %x01 = fsub double %x1, %x0 1532 ret double %x01 1533} 1534 1535; Check output when 1 or both extracts have extra uses. 
1536 1537define float @extract_extract01_v4f32_fadd_f32_uses1(<4 x float> %x, float* %p) { 1538; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1: 1539; SSE3-SLOW: # %bb.0: 1540; SSE3-SLOW-NEXT: movss %xmm0, (%rdi) 1541; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1542; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 1543; SSE3-SLOW-NEXT: retq 1544; 1545; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1: 1546; SSE3-FAST: # %bb.0: 1547; SSE3-FAST-NEXT: movss %xmm0, (%rdi) 1548; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 1549; SSE3-FAST-NEXT: retq 1550; 1551; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1: 1552; AVX-SLOW: # %bb.0: 1553; AVX-SLOW-NEXT: vmovss %xmm0, (%rdi) 1554; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1555; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 1556; AVX-SLOW-NEXT: retq 1557; 1558; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1: 1559; AVX-FAST: # %bb.0: 1560; AVX-FAST-NEXT: vmovss %xmm0, (%rdi) 1561; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1562; AVX-FAST-NEXT: retq 1563 %x0 = extractelement <4 x float> %x, i32 0 1564 store float %x0, float* %p 1565 %x1 = extractelement <4 x float> %x, i32 1 1566 %x01 = fadd float %x0, %x1 1567 ret float %x01 1568} 1569 1570define float @extract_extract01_v4f32_fadd_f32_uses2(<4 x float> %x, float* %p) { 1571; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2: 1572; SSE3-SLOW: # %bb.0: 1573; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1574; SSE3-SLOW-NEXT: movss %xmm1, (%rdi) 1575; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 1576; SSE3-SLOW-NEXT: retq 1577; 1578; SSE3-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2: 1579; SSE3-FAST: # %bb.0: 1580; SSE3-FAST-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1581; SSE3-FAST-NEXT: movss %xmm1, (%rdi) 1582; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 1583; SSE3-FAST-NEXT: retq 1584; 1585; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2: 1586; AVX-SLOW: # %bb.0: 1587; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 
1588; AVX-SLOW-NEXT: vmovss %xmm1, (%rdi) 1589; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 1590; AVX-SLOW-NEXT: retq 1591; 1592; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2: 1593; AVX-FAST: # %bb.0: 1594; AVX-FAST-NEXT: vextractps $1, %xmm0, (%rdi) 1595; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1596; AVX-FAST-NEXT: retq 1597 %x0 = extractelement <4 x float> %x, i32 0 1598 %x1 = extractelement <4 x float> %x, i32 1 1599 store float %x1, float* %p 1600 %x01 = fadd float %x0, %x1 1601 ret float %x01 1602} 1603 1604define float @extract_extract01_v4f32_fadd_f32_uses3(<4 x float> %x, float* %p1, float* %p2) { 1605; SSE3-LABEL: extract_extract01_v4f32_fadd_f32_uses3: 1606; SSE3: # %bb.0: 1607; SSE3-NEXT: movss %xmm0, (%rdi) 1608; SSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1609; SSE3-NEXT: movss %xmm1, (%rsi) 1610; SSE3-NEXT: addss %xmm1, %xmm0 1611; SSE3-NEXT: retq 1612; 1613; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3: 1614; AVX: # %bb.0: 1615; AVX-NEXT: vmovss %xmm0, (%rdi) 1616; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1617; AVX-NEXT: vmovss %xmm1, (%rsi) 1618; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 1619; AVX-NEXT: retq 1620 %x0 = extractelement <4 x float> %x, i32 0 1621 store float %x0, float* %p1 1622 %x1 = extractelement <4 x float> %x, i32 1 1623 store float %x1, float* %p2 1624 %x01 = fadd float %x0, %x1 1625 ret float %x01 1626} 1627 1628; Repeat tests from general reductions to verify output for hoppy targets: 1629; PR38971: https://bugs.llvm.org/show_bug.cgi?id=38971 1630 1631declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) 1632declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) 1633 1634define float @fadd_reduce_v8f32(float %a0, <8 x float> %a1) { 1635; SSE3-SLOW-LABEL: fadd_reduce_v8f32: 1636; SSE3-SLOW: # %bb.0: 1637; SSE3-SLOW-NEXT: addps %xmm2, %xmm1 1638; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2 1639; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 1640; SSE3-SLOW-NEXT: addps 
%xmm1, %xmm2 1641; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] 1642; SSE3-SLOW-NEXT: addss %xmm2, %xmm1 1643; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 1644; SSE3-SLOW-NEXT: retq 1645; 1646; SSE3-FAST-LABEL: fadd_reduce_v8f32: 1647; SSE3-FAST: # %bb.0: 1648; SSE3-FAST-NEXT: haddps %xmm1, %xmm2 1649; SSE3-FAST-NEXT: haddps %xmm2, %xmm2 1650; SSE3-FAST-NEXT: haddps %xmm2, %xmm2 1651; SSE3-FAST-NEXT: addss %xmm2, %xmm0 1652; SSE3-FAST-NEXT: retq 1653; 1654; AVX-SLOW-LABEL: fadd_reduce_v8f32: 1655; AVX-SLOW: # %bb.0: 1656; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 1657; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 1658; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1659; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 1660; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] 1661; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 1662; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 1663; AVX-SLOW-NEXT: vzeroupper 1664; AVX-SLOW-NEXT: retq 1665; 1666; AVX-FAST-LABEL: fadd_reduce_v8f32: 1667; AVX-FAST: # %bb.0: 1668; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 1669; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1 1670; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 1671; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 1672; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 1673; AVX-FAST-NEXT: vzeroupper 1674; AVX-FAST-NEXT: retq 1675 %r = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1) 1676 ret float %r 1677} 1678 1679define double @fadd_reduce_v4f64(double %a0, <4 x double> %a1) { 1680; SSE3-SLOW-LABEL: fadd_reduce_v4f64: 1681; SSE3-SLOW: # %bb.0: 1682; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1 1683; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2 1684; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] 1685; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2 1686; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0 1687; SSE3-SLOW-NEXT: retq 1688; 1689; SSE3-FAST-LABEL: fadd_reduce_v4f64: 1690; SSE3-FAST: # %bb.0: 1691; SSE3-FAST-NEXT: haddpd %xmm1, %xmm2 1692; SSE3-FAST-NEXT: haddpd %xmm2, %xmm2 1693; 
SSE3-FAST-NEXT: addsd %xmm2, %xmm0 1694; SSE3-FAST-NEXT: retq 1695; 1696; AVX-SLOW-LABEL: fadd_reduce_v4f64: 1697; AVX-SLOW: # %bb.0: 1698; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 1699; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1 1700; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] 1701; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1 1702; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1703; AVX-SLOW-NEXT: vzeroupper 1704; AVX-SLOW-NEXT: retq 1705; 1706; AVX-FAST-LABEL: fadd_reduce_v4f64: 1707; AVX-FAST: # %bb.0: 1708; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 1709; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1 1710; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 1711; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1712; AVX-FAST-NEXT: vzeroupper 1713; AVX-FAST-NEXT: retq 1714 %r = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1) 1715 ret double %r 1716} 1717 1718define float @PR39936_v8f32(<8 x float>) { 1719; SSSE3-SLOW-LABEL: PR39936_v8f32: 1720; SSSE3-SLOW: # %bb.0: 1721; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 1722; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 1723; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] 1724; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] 1725; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 1726; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1727; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 1728; SSSE3-SLOW-NEXT: retq 1729; 1730; SSSE3-FAST-LABEL: PR39936_v8f32: 1731; SSSE3-FAST: # %bb.0: 1732; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0 1733; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 1734; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 1735; SSSE3-FAST-NEXT: retq 1736; 1737; SSE3-SLOW-LABEL: PR39936_v8f32: 1738; SSE3-SLOW: # %bb.0: 1739; SSE3-SLOW-NEXT: haddps %xmm1, %xmm0 1740; SSE3-SLOW-NEXT: haddps %xmm0, %xmm0 1741; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1742; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 1743; SSE3-SLOW-NEXT: retq 1744; 1745; SSE3-FAST-LABEL: PR39936_v8f32: 1746; SSE3-FAST: # %bb.0: 1747; SSE3-FAST-NEXT: 
haddps %xmm1, %xmm0 1748; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 1749; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 1750; SSE3-FAST-NEXT: retq 1751; 1752; AVX-SLOW-LABEL: PR39936_v8f32: 1753; AVX-SLOW: # %bb.0: 1754; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 1755; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 1756; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1757; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1758; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 1759; AVX-SLOW-NEXT: vzeroupper 1760; AVX-SLOW-NEXT: retq 1761; 1762; AVX-FAST-LABEL: PR39936_v8f32: 1763; AVX-FAST: # %bb.0: 1764; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 1765; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 1766; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1767; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1768; AVX-FAST-NEXT: vzeroupper 1769; AVX-FAST-NEXT: retq 1770 %2 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef> 1771 %3 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 1772 %4 = fadd <8 x float> %2, %3 1773 %5 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1774 %6 = shufflevector <8 x float> %4, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1775 %7 = fadd <8 x float> %5, %6 1776 %8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1777 %9 = fadd <8 x float> %7, %8 1778 %10 = extractelement <8 x float> %9, i32 0 1779 ret float %10 1780} 1781 1782define float @hadd32_4(<4 x float> %x225) { 1783; SSE3-SLOW-LABEL: hadd32_4: 1784; SSE3-SLOW: # %bb.0: 1785; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 1786; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1787; SSE3-SLOW-NEXT: addps 
%xmm0, %xmm1 1788; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 1789; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 1790; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 1791; SSE3-SLOW-NEXT: retq 1792; 1793; SSE3-FAST-LABEL: hadd32_4: 1794; SSE3-FAST: # %bb.0: 1795; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 1796; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1797; SSE3-FAST-NEXT: addps %xmm0, %xmm1 1798; SSE3-FAST-NEXT: haddps %xmm1, %xmm1 1799; SSE3-FAST-NEXT: movaps %xmm1, %xmm0 1800; SSE3-FAST-NEXT: retq 1801; 1802; AVX-SLOW-LABEL: hadd32_4: 1803; AVX-SLOW: # %bb.0: 1804; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1805; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 1806; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1807; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 1808; AVX-SLOW-NEXT: retq 1809; 1810; AVX-FAST-LABEL: hadd32_4: 1811; AVX-FAST: # %bb.0: 1812; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1813; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 1814; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1815; AVX-FAST-NEXT: retq 1816 %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 1817 %x227 = fadd <4 x float> %x225, %x226 1818 %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 1819 %x229 = fadd <4 x float> %x227, %x228 1820 %x230 = extractelement <4 x float> %x229, i32 0 1821 ret float %x230 1822} 1823 1824define float @hadd32_8(<8 x float> %x225) { 1825; SSE3-SLOW-LABEL: hadd32_8: 1826; SSE3-SLOW: # %bb.0: 1827; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 1828; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1829; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 1830; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 1831; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 1832; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 1833; SSE3-SLOW-NEXT: retq 1834; 1835; SSE3-FAST-LABEL: hadd32_8: 1836; SSE3-FAST: # %bb.0: 1837; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 1838; 
SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1839; SSE3-FAST-NEXT: addps %xmm0, %xmm1 1840; SSE3-FAST-NEXT: haddps %xmm1, %xmm1 1841; SSE3-FAST-NEXT: movaps %xmm1, %xmm0 1842; SSE3-FAST-NEXT: retq 1843; 1844; AVX-SLOW-LABEL: hadd32_8: 1845; AVX-SLOW: # %bb.0: 1846; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1847; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 1848; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1849; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 1850; AVX-SLOW-NEXT: vzeroupper 1851; AVX-SLOW-NEXT: retq 1852; 1853; AVX-FAST-LABEL: hadd32_8: 1854; AVX-FAST: # %bb.0: 1855; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1856; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 1857; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1858; AVX-FAST-NEXT: vzeroupper 1859; AVX-FAST-NEXT: retq 1860 %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1861 %x227 = fadd <8 x float> %x225, %x226 1862 %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1863 %x229 = fadd <8 x float> %x227, %x228 1864 %x230 = extractelement <8 x float> %x229, i32 0 1865 ret float %x230 1866} 1867 1868define float @hadd32_16(<16 x float> %x225) { 1869; SSE3-SLOW-LABEL: hadd32_16: 1870; SSE3-SLOW: # %bb.0: 1871; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 1872; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1873; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 1874; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 1875; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 1876; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 1877; SSE3-SLOW-NEXT: retq 1878; 1879; SSE3-FAST-LABEL: hadd32_16: 1880; SSE3-FAST: # %bb.0: 1881; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 1882; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1883; SSE3-FAST-NEXT: addps %xmm0, %xmm1 1884; SSE3-FAST-NEXT: haddps %xmm1, %xmm1 1885; 
SSE3-FAST-NEXT: movaps %xmm1, %xmm0 1886; SSE3-FAST-NEXT: retq 1887; 1888; AVX-SLOW-LABEL: hadd32_16: 1889; AVX-SLOW: # %bb.0: 1890; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1891; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 1892; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 1893; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 1894; AVX-SLOW-NEXT: vzeroupper 1895; AVX-SLOW-NEXT: retq 1896; 1897; AVX-FAST-LABEL: hadd32_16: 1898; AVX-FAST: # %bb.0: 1899; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1900; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 1901; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1902; AVX-FAST-NEXT: vzeroupper 1903; AVX-FAST-NEXT: retq 1904 %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1905 %x227 = fadd <16 x float> %x225, %x226 1906 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1907 %x229 = fadd <16 x float> %x227, %x228 1908 %x230 = extractelement <16 x float> %x229, i32 0 1909 ret float %x230 1910} 1911 1912define float @hadd32_4_optsize(<4 x float> %x225) optsize { 1913; SSE3-LABEL: hadd32_4_optsize: 1914; SSE3: # %bb.0: 1915; SSE3-NEXT: movaps %xmm0, %xmm1 1916; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1917; SSE3-NEXT: addps %xmm0, %xmm1 1918; SSE3-NEXT: haddps %xmm1, %xmm1 1919; SSE3-NEXT: movaps %xmm1, %xmm0 1920; SSE3-NEXT: retq 1921; 1922; AVX-LABEL: hadd32_4_optsize: 1923; AVX: # %bb.0: 1924; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1925; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 1926; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1927; AVX-NEXT: retq 1928 %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, 
i32 3, i32 undef, i32 undef> 1929 %x227 = fadd <4 x float> %x225, %x226 1930 %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 1931 %x229 = fadd <4 x float> %x227, %x228 1932 %x230 = extractelement <4 x float> %x229, i32 0 1933 ret float %x230 1934} 1935 1936define float @hadd32_8_optsize(<8 x float> %x225) optsize { 1937; SSE3-LABEL: hadd32_8_optsize: 1938; SSE3: # %bb.0: 1939; SSE3-NEXT: movaps %xmm0, %xmm1 1940; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1941; SSE3-NEXT: addps %xmm0, %xmm1 1942; SSE3-NEXT: haddps %xmm1, %xmm1 1943; SSE3-NEXT: movaps %xmm1, %xmm0 1944; SSE3-NEXT: retq 1945; 1946; AVX-LABEL: hadd32_8_optsize: 1947; AVX: # %bb.0: 1948; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1949; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 1950; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1951; AVX-NEXT: vzeroupper 1952; AVX-NEXT: retq 1953 %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1954 %x227 = fadd <8 x float> %x225, %x226 1955 %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1956 %x229 = fadd <8 x float> %x227, %x228 1957 %x230 = extractelement <8 x float> %x229, i32 0 1958 ret float %x230 1959} 1960 1961define float @hadd32_16_optsize(<16 x float> %x225) optsize { 1962; SSE3-LABEL: hadd32_16_optsize: 1963; SSE3: # %bb.0: 1964; SSE3-NEXT: movaps %xmm0, %xmm1 1965; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1966; SSE3-NEXT: addps %xmm0, %xmm1 1967; SSE3-NEXT: haddps %xmm1, %xmm1 1968; SSE3-NEXT: movaps %xmm1, %xmm0 1969; SSE3-NEXT: retq 1970; 1971; AVX-LABEL: hadd32_16_optsize: 1972; AVX: # %bb.0: 1973; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1974; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 1975; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 1976; AVX-NEXT: vzeroupper 1977; AVX-NEXT: retq 
1978 %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1979 %x227 = fadd <16 x float> %x225, %x226 1980 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1981 %x229 = fadd <16 x float> %x227, %x228 1982 %x230 = extractelement <16 x float> %x229, i32 0 1983 ret float %x230 1984} 1985 1986define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 { 1987; SSE3-LABEL: hadd32_4_pgso: 1988; SSE3: # %bb.0: 1989; SSE3-NEXT: movaps %xmm0, %xmm1 1990; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1991; SSE3-NEXT: addps %xmm0, %xmm1 1992; SSE3-NEXT: haddps %xmm1, %xmm1 1993; SSE3-NEXT: movaps %xmm1, %xmm0 1994; SSE3-NEXT: retq 1995; 1996; AVX-LABEL: hadd32_4_pgso: 1997; AVX: # %bb.0: 1998; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 1999; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 2000; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 2001; AVX-NEXT: retq 2002 %x226 = shufflevector <4 x float> %x225, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 2003 %x227 = fadd <4 x float> %x225, %x226 2004 %x228 = shufflevector <4 x float> %x227, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 2005 %x229 = fadd <4 x float> %x227, %x228 2006 %x230 = extractelement <4 x float> %x229, i32 0 2007 ret float %x230 2008} 2009 2010define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 { 2011; SSE3-LABEL: hadd32_8_pgso: 2012; SSE3: # %bb.0: 2013; SSE3-NEXT: movaps %xmm0, %xmm1 2014; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2015; SSE3-NEXT: addps %xmm0, %xmm1 2016; SSE3-NEXT: haddps %xmm1, %xmm1 2017; SSE3-NEXT: movaps %xmm1, %xmm0 2018; SSE3-NEXT: retq 2019; 2020; AVX-LABEL: 
hadd32_8_pgso: 2021; AVX: # %bb.0: 2022; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2023; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 2024; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 2025; AVX-NEXT: vzeroupper 2026; AVX-NEXT: retq 2027 %x226 = shufflevector <8 x float> %x225, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2028 %x227 = fadd <8 x float> %x225, %x226 2029 %x228 = shufflevector <8 x float> %x227, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2030 %x229 = fadd <8 x float> %x227, %x228 2031 %x230 = extractelement <8 x float> %x229, i32 0 2032 ret float %x230 2033} 2034 2035define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 { 2036; SSE3-LABEL: hadd32_16_pgso: 2037; SSE3: # %bb.0: 2038; SSE3-NEXT: movaps %xmm0, %xmm1 2039; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2040; SSE3-NEXT: addps %xmm0, %xmm1 2041; SSE3-NEXT: haddps %xmm1, %xmm1 2042; SSE3-NEXT: movaps %xmm1, %xmm0 2043; SSE3-NEXT: retq 2044; 2045; AVX-LABEL: hadd32_16_pgso: 2046; AVX: # %bb.0: 2047; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2048; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 2049; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 2050; AVX-NEXT: vzeroupper 2051; AVX-NEXT: retq 2052 %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2053 %x227 = fadd <16 x float> %x225, %x226 2054 %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2055 %x229 = fadd <16 x float> %x227, %x228 2056 %x230 = extractelement <16 x float> %x229, i32 0 2057 ret float %x230 2058} 2059 2060define float 
@partial_reduction_fadd_v8f32(<8 x float> %x) { 2061; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32: 2062; SSE3-SLOW: # %bb.0: 2063; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 2064; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2065; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 2066; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2067; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 2068; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 2069; SSE3-SLOW-NEXT: retq 2070; 2071; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32: 2072; SSE3-FAST: # %bb.0: 2073; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 2074; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2075; SSE3-FAST-NEXT: addps %xmm0, %xmm1 2076; SSE3-FAST-NEXT: haddps %xmm1, %xmm1 2077; SSE3-FAST-NEXT: movaps %xmm1, %xmm0 2078; SSE3-FAST-NEXT: retq 2079; 2080; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32: 2081; AVX-SLOW: # %bb.0: 2082; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2083; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 2084; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2085; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 2086; AVX-SLOW-NEXT: vzeroupper 2087; AVX-SLOW-NEXT: retq 2088; 2089; AVX-FAST-LABEL: partial_reduction_fadd_v8f32: 2090; AVX-FAST: # %bb.0: 2091; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 2092; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 2093; AVX-FAST-NEXT: vzeroupper 2094; AVX-FAST-NEXT: retq 2095 %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2096 %x0213 = fadd <8 x float> %x, %x23 2097 %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2098 %x0123 = fadd nsz reassoc <8 x float> %x0213, %x13 2099 %r = extractelement <8 x float> %x0123, i32 0 2100 ret float %r 2101} 2102 2103; Negative test - only the flags on the final math op in the 2104; sequence determine whether we can transform to horizontal 
ops. 2105 2106define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) { 2107; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags: 2108; SSE3-SLOW: # %bb.0: 2109; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 2110; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2111; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 2112; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2113; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 2114; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 2115; SSE3-SLOW-NEXT: retq 2116; 2117; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags: 2118; SSE3-FAST: # %bb.0: 2119; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 2120; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2121; SSE3-FAST-NEXT: addps %xmm0, %xmm1 2122; SSE3-FAST-NEXT: haddps %xmm1, %xmm1 2123; SSE3-FAST-NEXT: movaps %xmm1, %xmm0 2124; SSE3-FAST-NEXT: retq 2125; 2126; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags: 2127; AVX-SLOW: # %bb.0: 2128; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2129; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 2130; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2131; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 2132; AVX-SLOW-NEXT: vzeroupper 2133; AVX-SLOW-NEXT: retq 2134; 2135; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags: 2136; AVX-FAST: # %bb.0: 2137; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2138; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 2139; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 2140; AVX-FAST-NEXT: vzeroupper 2141; AVX-FAST-NEXT: retq 2142 %x23 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2143 %x0213 = fadd fast <8 x float> %x, %x23 2144 %x13 = shufflevector <8 x float> %x0213, <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2145 %x0123 = fadd ninf nnan <8 x float> %x0213, %x13 2146 %r = extractelement <8 x float> %x0123, i32 0 2147 ret float 
%r 2148} 2149 2150define float @partial_reduction_fadd_v16f32(<16 x float> %x) { 2151; SSE3-SLOW-LABEL: partial_reduction_fadd_v16f32: 2152; SSE3-SLOW: # %bb.0: 2153; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 2154; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2155; SSE3-SLOW-NEXT: addps %xmm0, %xmm1 2156; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2157; SSE3-SLOW-NEXT: addss %xmm0, %xmm1 2158; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 2159; SSE3-SLOW-NEXT: retq 2160; 2161; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32: 2162; SSE3-FAST: # %bb.0: 2163; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 2164; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2165; SSE3-FAST-NEXT: addps %xmm0, %xmm1 2166; SSE3-FAST-NEXT: haddps %xmm1, %xmm1 2167; SSE3-FAST-NEXT: movaps %xmm1, %xmm0 2168; SSE3-FAST-NEXT: retq 2169; 2170; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32: 2171; AVX-SLOW: # %bb.0: 2172; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 2173; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 2174; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 2175; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 2176; AVX-SLOW-NEXT: vzeroupper 2177; AVX-SLOW-NEXT: retq 2178; 2179; AVX-FAST-LABEL: partial_reduction_fadd_v16f32: 2180; AVX-FAST: # %bb.0: 2181; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 2182; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 2183; AVX-FAST-NEXT: vzeroupper 2184; AVX-FAST-NEXT: retq 2185 %x23 = shufflevector <16 x float> %x, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2186 %x0213 = fadd <16 x float> %x, %x23 2187 %x13 = shufflevector <16 x float> %x0213, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2188 %x0123 = fadd reassoc nsz <16 x float> 
%x0213, %x13 2189 %r = extractelement <16 x float> %x0123, i32 0 2190 ret float %r 2191} 2192 2193!llvm.module.flags = !{!0} 2194!0 = !{i32 1, !"ProfileSummary", !1} 2195!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} 2196!2 = !{!"ProfileFormat", !"InstrProf"} 2197!3 = !{!"TotalCount", i64 10000} 2198!4 = !{!"MaxCount", i64 10} 2199!5 = !{!"MaxInternalCount", i64 1} 2200!6 = !{!"MaxFunctionCount", i64 1000} 2201!7 = !{!"NumCounts", i64 3} 2202!8 = !{!"NumFunctions", i64 3} 2203!9 = !{!"DetailedSummary", !10} 2204!10 = !{!11, !12, !13} 2205!11 = !{i32 10000, i64 100, i32 1} 2206!12 = !{i32 999000, i64 100, i32 1} 2207!13 = !{i32 999999, i64 1, i32 2} 2208!14 = !{!"function_entry_count", i64 0} 2209