; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST

; Vectorized Pairwise Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return (x[0] + x[1]) + (x[2] + x[3]);
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
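;
; A minimal intrinsics sketch of that pairwise tree (illustrative only, and
; an assumption of this note rather than part of the test; requires SSE3 and
; <pmmintrin.h>). It is the shape the *-FAST runs below are expected to
; recover with haddps:
;
; __m128 sum4(__m128 A0, __m128 A1, __m128 A2, __m128 A3) {
;   // _mm_hadd_ps(a, b) = { a0+a1, a2+a3, b0+b1, b2+b3 }
;   return _mm_hadd_ps(_mm_hadd_ps(A0, A1), _mm_hadd_ps(A2, A3));
; }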

define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm0
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: pair_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,1]
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[1]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: retq
  %5 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <2 x float> %7, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %9 = fadd <2 x float> %7, %8
  %10 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %12 = fadd <2 x float> %10, %11
  %13 = shufflevector <2 x float> %12, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %14 = fadd <2 x float> %12, %13
  %15 = shufflevector <2 x float> %9, <2 x float> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %18, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %20 = fadd <2 x float> %18, %19
  %21 = shufflevector <2 x float> %20, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x float> %15, <4 x float> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <2 x float> %25, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %27 = fadd <2 x float> %25, %26
  %28 = shufflevector <2 x float> %27, <2 x float> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x float> %22, <4 x float> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %29
}

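; The integer version that follows has the same pairwise shape; a hedged
; SSSE3 sketch (illustrative only, assuming <tmmintrin.h>) of the phaddd
; tree the *-FAST runs below are expected to form:
;
; __m128i sum4i(__m128i A0, __m128i A1, __m128i A2, __m128i A3) {
;   return _mm_hadd_epi32(_mm_hadd_epi32(A0, A1), _mm_hadd_epi32(A2, A3));
; }
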
define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: pair_sum_v4i32_v4i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <2 x i32> %7, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %9 = add <2 x i32> %7, %8
  %10 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %11 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %12 = add <2 x i32> %10, %11
  %13 = shufflevector <2 x i32> %12, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %14 = add <2 x i32> %12, %13
  %15 = shufflevector <2 x i32> %9, <2 x i32> %14, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %16 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %18, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %20 = add <2 x i32> %18, %19
  %21 = shufflevector <2 x i32> %20, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %22 = shufflevector <4 x i32> %15, <4 x i32> %21, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <2 x i32> %25, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %27 = add <2 x i32> %25, %26
  %28 = shufflevector <2 x i32> %27, <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %29 = shufflevector <4 x i32> %22, <4 x i32> %28, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %29
}

define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1
; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm2
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm4
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm1
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2
; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6
; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-FAST-NEXT: retq
  %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %11 = fadd <2 x float> %9, %10
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %13 = fadd <2 x float> %11, %12
  %14 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x float> %1, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %16 = fadd <2 x float> %14, %15
  %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> <i32 1, i32 undef>
  %18 = fadd <2 x float> %16, %17
  %19 = shufflevector <2 x float> %13, <2 x float> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x float> %2, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %22 = fadd <2 x float> %20, %21
  %23 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %25 = fadd <2 x float> %23, %24
  %26 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x float> %4, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %28 = fadd <2 x float> %26, %27
  %29 = shufflevector <2 x float> %28, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x float> %5, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %32 = fadd <2 x float> %30, %31
  %33 = shufflevector <2 x float> %32, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x float> %34, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x float> %35, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x float> %22, <2 x float> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x float> %37, <4 x float> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x float> %38, <4 x float> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = fadd <4 x float> %36, %39
  %41 = shufflevector <4 x float> %40, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x float> %19, <8 x float> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x float> %6, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %45 = fadd <2 x float> %43, %44
  %46 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x float> %7, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %48 = fadd <2 x float> %46, %47
  %49 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x float> %45, <2 x float> %48, <2 x i32> <i32 1, i32 3>
  %51 = fadd <2 x float> %49, %50
  %52 = shufflevector <2 x float> %51, <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x float> %42, <8 x float> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x float> %53
}

define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) {
; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5
; SSSE3-SLOW-NEXT: phaddd %xmm5, %xmm2
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,1,1,1]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm2[0,2]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4
; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2
; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6
; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7
; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm2
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v8i32_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm2
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: retq
  %9 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %10 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %11 = add <2 x i32> %9, %10
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %13 = add <2 x i32> %11, %12
  %14 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %15 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %16 = add <2 x i32> %14, %15
  %17 = shufflevector <2 x i32> %16, <2 x i32> poison, <2 x i32> <i32 1, i32 undef>
  %18 = add <2 x i32> %16, %17
  %19 = shufflevector <2 x i32> %13, <2 x i32> %18, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %20 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %21 = shufflevector <4 x i32> %2, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %22 = add <2 x i32> %20, %21
  %23 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %25 = add <2 x i32> %23, %24
  %26 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %27 = shufflevector <4 x i32> %4, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %28 = add <2 x i32> %26, %27
  %29 = shufflevector <2 x i32> %28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %30 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %31 = shufflevector <4 x i32> %5, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %32 = add <2 x i32> %30, %31
  %33 = shufflevector <2 x i32> %32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %34 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %35 = shufflevector <4 x i32> %34, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %36 = shufflevector <4 x i32> %35, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  %37 = shufflevector <2 x i32> %22, <2 x i32> %25, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %38 = shufflevector <4 x i32> %37, <4 x i32> %29, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
  %39 = shufflevector <4 x i32> %38, <4 x i32> %33, <4 x i32> <i32 0, i32 1, i32 2, i32 5>
  %40 = add <4 x i32> %36, %39
  %41 = shufflevector <4 x i32> %40, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %42 = shufflevector <8 x i32> %19, <8 x i32> %41, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef>
  %43 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %44 = shufflevector <4 x i32> %6, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %45 = add <2 x i32> %43, %44
  %46 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
  %47 = shufflevector <4 x i32> %7, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
  %48 = add <2 x i32> %46, %47
  %49 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 0, i32 2>
  %50 = shufflevector <2 x i32> %45, <2 x i32> %48, <2 x i32> <i32 1, i32 3>
  %51 = add <2 x i32> %49, %50
  %52 = shufflevector <2 x i32> %51, <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %53 = shufflevector <8 x i32> %42, <8 x i32> %52, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
  ret <8 x i32> %53
}

; Vectorized Sequential Sum Reductions
; e.g.
; inline STYPE sum(VTYPE x) {
;   return ((x[0] + x[1]) + x[2]) + x[3];
; }
;
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { sum( A0 ), sum( A1 ), sum( A2 ), sum( A3 ) };
; }
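;
; Because FP addition is not associative, this strict evaluation order only
; exposes the leading x[0] + x[1] of each lane as a horizontal add; the rest
; must be accumulated in order with shuffle + add chains. One output lane,
; sketched with SSE intrinsics (illustrative only, assuming <xmmintrin.h>;
; the helper name is hypothetical):
;
; float sum(__m128 x) {
;   __m128 s = _mm_add_ss(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 1, 1, 1)));
;   s = _mm_add_ss(s, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 2, 2, 2)));
;   s = _mm_add_ss(s, _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 3, 3, 3)));
;   return _mm_cvtss_f32(s); // ((x0 + x1) + x2) + x3, in that order
; }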

define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
; SSSE3-SLOW-NEXT: addps %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-SLOW-NEXT: addps %xmm5, %xmm1
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm2
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm3
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1]
; SSSE3-FAST-NEXT: addps %xmm4, %xmm5
; SSSE3-FAST-NEXT: addps %xmm5, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT: addps %xmm0, %xmm2
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT: addps %xmm2, %xmm3
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0]
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm4
; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddps %xmm3, %xmm4, %xmm4
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm4
; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: retq
  %5 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 1, i32 5>
  %7 = fadd <2 x float> %5, %6
  %8 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 6>
  %9 = fadd <2 x float> %8, %7
  %10 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 7>
  %11 = fadd <2 x float> %10, %9
  %12 = shufflevector <2 x float> %11, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = fadd <4 x float> %13, %2
  %15 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = fadd <4 x float> %15, %14
  %17 = shufflevector <4 x float> %2, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = fadd <4 x float> %17, %16
  %19 = shufflevector <4 x float> %12, <4 x float> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = fadd <4 x float> %20, %3
  %22 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = fadd <4 x float> %22, %21
  %24 = shufflevector <4 x float> %3, <4 x float> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = fadd <4 x float> %24, %23
  %26 = shufflevector <4 x float> %19, <4 x float> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %26
}

define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm4
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movdqa %xmm0, %xmm4
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-FAST-NEXT: paddd %xmm0, %xmm4
; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5
; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7]
; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3]
; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: retq
  %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
  %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 1, i32 5>
  %7 = add <2 x i32> %5, %6
  %8 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 2, i32 6>
  %9 = add <2 x i32> %8, %7
  %10 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 3, i32 7>
  %11 = add <2 x i32> %10, %9
  %12 = shufflevector <2 x i32> %11, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %13 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %14 = add <4 x i32> %13, %2
  %15 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %16 = add <4 x i32> %15, %14
  %17 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %18 = add <4 x i32> %17, %16
  %19 = shufflevector <4 x i32> %12, <4 x i32> %18, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
  %20 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %21 = add <4 x i32> %20, %3
  %22 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
  %23 = add <4 x i32> %22, %21
  %24 = shufflevector <4 x i32> %3, <4 x i32> poison, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
  %25 = add <4 x i32> %24, %23
  %26 = shufflevector <4 x i32> %19, <4 x i32> %25, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %26
}

; Vectorized Reductions
; e.g.
; VTYPE sum4(VTYPE A0, VTYPE A1, VTYPE A2, VTYPE A3) {
;   return (VTYPE) { reduce( A0 ), reduce( A1 ), reduce( A2 ), reduce( A3 ) };
; }
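;
; llvm.vector.reduce.fadd is an ordered (sequential) reduction from its start
; value unless the call carries the 'reassoc' fast-math flag, so the first
; test below must keep strict lane order (scalar addss chains), while the
; _reassoc variant may be reassociated. A hedged sketch of the per-input
; shape the AVX-FAST reassoc lowering uses (illustrative only, assuming
; <pmmintrin.h>; the helper name is hypothetical):
;
; float reduce(__m128 x) {
;   __m128 t = _mm_add_ps(x, _mm_movehl_ps(x, x)); // { x0+x2, x1+x3, .. }
;   t = _mm_hadd_ps(t, t);                         // lane 0 = (x0+x2)+(x1+x3)
;   return _mm_cvtss_f32(t);
; }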

define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm0, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm5, %xmm0
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm5, %xmm1
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm2, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm2
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSSE3-SLOW-NEXT: addss %xmm3, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-SLOW-NEXT: addss %xmm4, %xmm3
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm5, %xmm1
; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-FAST-NEXT: retq
  %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  %9 = insertelement <4 x float> undef, float %5, i32 0
  %10 = insertelement <4 x float> %9, float %6, i32 1
  %11 = insertelement <4 x float> %10, float %7, i32 2
  %12 = insertelement <4 x float> %11, float %8, i32 3
  ret <4 x float> %12
}
declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)

define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5
; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3
; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4
; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
; SSSE3-FAST-NEXT: addps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm2, %xmm0
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
; SSSE3-FAST-NEXT: addps %xmm2, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; SSSE3-FAST-NEXT: addps %xmm3, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm1
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0]
; SSSE3-FAST-NEXT: movaps %xmm4, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2
; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1]
; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0]
; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0]
; AVX-FAST-NEXT: retq
  %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
  %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
  %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
  %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
  %9 = insertelement <4 x float> undef, float %5, i32 0
  %10 = insertelement <4 x float> %9, float %6, i32 1
  %11 = insertelement <4 x float> %10, float %7, i32 2
  %12 = insertelement <4 x float> %11, float %8, i32 3
  ret <4 x float> %12
}

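; Integer addition is associative, so llvm.vector.reduce.add may always be
; reassociated; the *-FAST runs below split each input once with a shuffle
; and then finish two inputs at a time per phaddd. A hedged sketch of one
; input (illustrative only, assuming <emmintrin.h>/<tmmintrin.h>; the helper
; name is hypothetical):
;
; int reduce_add(__m128i x) {
;   __m128i t = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
;   t = _mm_hadd_epi32(t, t); // lane 0 = (x0+x2)+(x1+x3)
;   return _mm_cvtsi128_si32(t);
; }
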
define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT: movdqa %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm0
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; AVX-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: reduction_sum_v4i32_v4i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-FAST-NEXT: retq
;
; AVX2-FAST-LABEL: reduction_sum_v4i32_v4i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX2-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FAST-NEXT: retq
  %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)
  %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1)
  %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)
  %8 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %3)
  %9 = insertelement <4 x i32> undef, i32 %5, i32 0
  %10 = insertelement <4 x i32> %9, i32 %6, i32 1
  %11 = insertelement <4 x i32> %10, i32 %7, i32 2
  %12 = insertelement <4 x i32> %11, i32 %8, i32 3
  ret <4 x i32> %12
}
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)