; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefix=AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2

; PR37890 - subvector reduction followed by shuffle reduction

define i32 @PR37890_v4i32(<4 x i32> %a) {
; SSE2-LABEL: PR37890_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v4i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movd %xmm0, %eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v4i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v4i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v4i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    retq
  %hi0 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %lo0 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sum0 = add <2 x i32> %lo0, %hi0
  %hi1 = shufflevector <2 x i32> %sum0, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %sum1 = add <2 x i32> %sum0, %hi1
  %e = extractelement <2 x i32> %sum1, i32 0
  ret i32 %e
}

define i16 @PR37890_v8i16(<8 x i16> %a) {
; SSE2-LABEL: PR37890_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v8i16:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    psrld $16, %xmm1
; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
; SSSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v8i16:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v8i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v8i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    retq
  %hi0 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo0 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum0 = add <4 x i16> %lo0, %hi0
  %hi1 = shufflevector <4 x i16> %sum0, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
  %lo1 = shufflevector <4 x i16> %sum0, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sum1 = add <2 x i16> %lo1, %hi1
  %hi2 = shufflevector <2 x i16> %sum1, <2 x i16> undef, <2 x i32> <i32 1, i32 undef>
  %sum2 = add <2 x i16> %sum1, %hi2
  %e = extractelement <2 x i16> %sum2, i32 0
  ret i16 %e
}

define i32 @PR37890_v8i32(<8 x i32> %a) {
; SSE2-LABEL: PR37890_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v8i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movd %xmm0, %eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v8i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    paddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v8i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v8i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum0 = add <4 x i32> %lo0, %hi0
  %hi1 = shufflevector <4 x i32> %sum0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %lo1 = shufflevector <4 x i32> %sum0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sum1 = add <2 x i32> %lo1, %hi1
  %hi2 = shufflevector <2 x i32> %sum1, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %sum2 = add <2 x i32> %sum1, %hi2
  %e = extractelement <2 x i32> %sum2, i32 0
  ret i32 %e
}

define i16 @PR37890_v16i16(<16 x i16> %a) {
; SSE2-LABEL: PR37890_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $16, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v16i16:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    psrld $16, %xmm1
; SSSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
; SSSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v16i16:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    paddw %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v16i16:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v16i16:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sum0 = add <8 x i16> %lo0, %hi0
  %hi1 = shufflevector <8 x i16> %sum0, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo1 = shufflevector <8 x i16> %sum0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum1 = add <4 x i16> %lo1, %hi1
  %hi2 = shufflevector <4 x i16> %sum1, <4 x i16> undef, <2 x i32> <i32 2, i32 3>
  %lo2 = shufflevector <4 x i16> %sum1, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sum2 = add <2 x i16> %lo2, %hi2
  %hi3 = shufflevector <2 x i16> %sum2, <2 x i16> undef, <2 x i32> <i32 1, i32 undef>
  %sum3 = add <2 x i16> %sum2, %hi3
  %e = extractelement <2 x i16> %sum3, i32 0
  ret i16 %e
}

define i32 @PR37890_v16i32(<16 x i32> %a) {
; SSE2-LABEL: PR37890_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    paddd %xmm3, %xmm1
; SSE2-NEXT:    paddd %xmm2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE2-NEXT:    paddd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR37890_v16i32:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm1
; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm1
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movd %xmm1, %eax
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR37890_v16i32:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    paddd %xmm3, %xmm1
; SSSE3-FAST-NEXT:    paddd %xmm2, %xmm1
; SSSE3-FAST-NEXT:    paddd %xmm0, %xmm1
; SSSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSSE3-FAST-NEXT:    paddd %xmm1, %xmm0
; SSSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movd %xmm0, %eax
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR37890_v16i32:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
; AVX1-SLOW-NEXT:    vzeroupper
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR37890_v16i32:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
; AVX1-FAST-NEXT:    vzeroupper
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR37890_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %hi0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sum0 = add <8 x i32> %lo0, %hi0
  %hi1 = shufflevector <8 x i32> %sum0, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %lo1 = shufflevector <8 x i32> %sum0, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sum1 = add <4 x i32> %lo1, %hi1
  %hi2 = shufflevector <4 x i32> %sum1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %lo2 = shufflevector <4 x i32> %sum1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sum2 = add <2 x i32> %lo2, %hi2
  %hi3 = shufflevector <2 x i32> %sum2, <2 x i32> undef, <2 x i32> <i32 1, i32 undef>
  %sum3 = add <2 x i32> %sum2, %hi3
  %e = extractelement <2 x i32> %sum3, i32 0
  ret i32 %e
}