; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx,+fast-hops | FileCheck %s --check-prefix=AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512

;
; vXf32 (accum)
;
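;
; Under 'fast' (reassoc) these reductions may be evaluated in any association
; order, which is what permits the shuffle+add (and haddps/haddpd) lowerings
; checked below. As a minimal scalar-equivalent sketch of the v2f32 case
; (value names here are illustrative only, not part of the test), one legal
; evaluation is:
;
;   %e0 = extractelement <2 x float> %a1, i32 0
;   %e1 = extractelement <2 x float> %a1, i32 1
;   %s0 = fadd fast float %e0, %e1
;   %r  = fadd fast float %s0, %a0   ; one reassociation of the reduction
;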
define float @test_v2f32(float %a0, <2 x float> %a1) {
; SSE2-LABEL: test_v2f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm1, %xmm2
; SSE2-NEXT: addss %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm2
; SSE41-NEXT: addss %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float %a0, <2 x float> %a1)
  ret float %1
}

define float @test_v4f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: test_v4f32:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: addss %xmm2, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1)
  ret float %1
}

define float @test_v8f32(float %a0, <8 x float> %a1) {
; SSE2-LABEL: test_v8f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: addss %xmm2, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float %a0, <8 x float> %a1)
  ret float %1
}

define float @test_v16f32(float %a0, <16 x float> %a1) {
; SSE2-LABEL: test_v16f32:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm4, %xmm2
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm2
; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: addps %xmm1, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
; SSE2-NEXT: addss %xmm2, %xmm1
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm4, %xmm2
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm2
; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE41-NEXT: addps %xmm1, %xmm2
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE41-NEXT: addss %xmm2, %xmm1
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float %a0, <16 x float> %a1)
  ret float %1
}

;
; vXf32 (zero)
;
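;
; With a 0.0 start value, 'fast' (which includes nsz) lets the accumulator
; fold away entirely, so unlike the accum tests above there is no trailing
; addss/vaddss of an incoming scalar at the end of each sequence below.
;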
define float @test_v2f32_zero(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_zero(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_zero(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_zero(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_zero:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_zero:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f32_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f32_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf32 (undef)
;

define float @test_v2f32_undef(<2 x float> %a0) {
; SSE2-LABEL: test_v2f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
; SSE2-NEXT: addss %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: addss %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float 0.0, <2 x float> %a0)
  ret float %1
}

define float @test_v4f32_undef(<4 x float> %a0) {
; SSE2-LABEL: test_v4f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.0, <4 x float> %a0)
  ret float %1
}

define float @test_v8f32_undef(<8 x float> %a0) {
; SSE2-LABEL: test_v8f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v8f32(float 0.0, <8 x float> %a0)
  ret float %1
}

define float @test_v16f32_undef(<16 x float> %a0) {
; SSE2-LABEL: test_v16f32_undef:
; SSE2: # %bb.0:
; SSE2-NEXT: addps %xmm3, %xmm1
; SSE2-NEXT: addps %xmm2, %xmm0
; SSE2-NEXT: addps %xmm1, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE2-NEXT: addps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32_undef:
; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm3, %xmm1
; SSE41-NEXT: addps %xmm2, %xmm0
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: movaps %xmm0, %xmm1
; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE41-NEXT: addps %xmm0, %xmm1
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE41-NEXT: addss %xmm0, %xmm1
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f32_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f32_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f32_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast float @llvm.vector.reduce.fadd.f32.v16f32(float 0.0, <16 x float> %a0)
  ret float %1
}

;
; vXf64 (accum)
;
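;
; The f64 tests mirror the f32 ones, but each 128-bit step needs only a
; single high/low swap (unpckhpd/vpermilpd) before the final addsd, since a
; v2f64 has just two lanes.
;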
define double @test_v2f64(double %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double %a0, <2 x double> %a1)
  ret double %1
}

define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double %a0, <4 x double> %a1)
  ret double %1
}

define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-LABEL: test_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm4, %xmm2
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double %a0, <8 x double> %a1)
  ret double %1
}
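; For v16f64 the SysV x86-64 calling convention runs out of vector argument
; registers: with the accumulator in %xmm0, only seven of the eight v2f64
; pieces fit in %xmm1-%xmm7, so the SSE sequence below picks the last piece
; up from the stack (the (%rsp) memory operand).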
define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE-LABEL: test_v16f64:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: addpd %xmm2, %xmm4
; SSE-NEXT: addpd %xmm1, %xmm4
; SSE-NEXT: movapd %xmm4, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
; SSE-NEXT: addsd %xmm4, %xmm1
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm4, %ymm2, %ymm2
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm4, %ymm2, %ymm2
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-FAST-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf64x4 $1, %zmm1, %ymm2
; AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddpd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double %a0, <16 x double> %a1)
  ret double %1
}

;
; vXf64 (zero)
;

define double @test_v2f64_zero(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_zero(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64_zero:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64_zero:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

;
; vXf64 (undef)
;

define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v2f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v2f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v2f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double 0.0, <2 x double> %a0)
  ret double %1
}

define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v4f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v4f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v4f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v4f64(double 0.0, <4 x double> %a0)
  ret double %1
}

define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v8f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v8f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v8f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v8f64(double 0.0, <8 x double> %a0)
  ret double %1
}

define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm6, %xmm2
; SSE-NEXT: addpd %xmm4, %xmm0
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm7, %xmm3
; SSE-NEXT: addpd %xmm5, %xmm1
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: addpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-SLOW-LABEL: test_v16f64_undef:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-SLOW-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: test_v16f64_undef:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX1-FAST-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-LABEL: test_v16f64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = call fast double @llvm.vector.reduce.fadd.f64.v16f64(double 0.0, <16 x double> %a0)
  ret double %1
}

declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.f32.v16f32(float, <16 x float>)

declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.f64.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.f64.v16f64(double, <16 x double>)
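; A minimal sketch of regenerating the CHECK assertions above after a
; codegen change (the exact path is an assumption; pass whatever this test
; file is named):
;   llvm/utils/update_llc_test_checks.py <path-to-this-test>.ll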