; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512-FAST

; Each function below extracts two adjacent integer lanes from one vector and
; adds or subtracts them as scalars. The runs with the fast-hops attribute are
; expected to select a horizontal add/sub instruction where the lane order
; allows it; the runs without it are expected to keep scalar extract + add/sub.
; The assertion lines are generated; regenerate them with the script named on
; the first line instead of editing them by hand.

; 128-bit vectors, 16/32-bit, add/sub

; Adds lanes 0 and 1 of a <4 x i32>. The fast-hops runs expect one
; phaddd/vphaddd followed by a move of result lane 0.
define i32 @extract_extract01_v4i32_add_i32(<4 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x i32> %x, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %x01 = add i32 %x0, %x1
  ret i32 %x01
}

; Adds lanes 2 and 3 of a <4 x i32>. The fast-hops runs expect a horizontal
; add and then an extract of result lane 1.
define i32 @extract_extract23_v4i32_add_i32(<4 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4i32_add_i32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4i32_add_i32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx
; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4i32_add_i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x i32> %x, i32 2
  %x1 = extractelement <4 x i32> %x, i32 3
  %x01 = add i32 %x0, %x1
  ret i32 %x01
}

; Same as the lane 0/1 i32 add, with the add operands swapped (%x1 + %x0).
; The expected code is the same as for the unswapped form.
define i32 @extract_extract01_v4i32_add_i32_commute(<4 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x i32> %x, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %x01 = add i32 %x1, %x0
  ret i32 %x01
}

; Same as the lane 2/3 i32 add, with the add operands swapped (%x1 + %x0).
; The expected code is the same as for the unswapped form.
define i32 @extract_extract23_v4i32_add_i32_commute(<4 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4i32_add_i32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx
; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4i32_add_i32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x i32> %x, i32 2
  %x1 = extractelement <4 x i32> %x, i32 3
  %x01 = add i32 %x1, %x0
  ret i32 %x01
}

; Adds lanes 0 and 1 of an <8 x i16>. The fast-hops runs expect one
; phaddw/vphaddw followed by a move of result lane 0.
define i16 @extract_extract01_v8i16_add_i16(<8 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8i16_add_i16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pextrw $1, %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8i16_add_i16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8i16_add_i16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrw $1, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8i16_add_i16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i16> %x, i32 0
  %x1 = extractelement <8 x i16> %x, i32 1
  %x01 = add i16 %x0, %x1
  ret i16 %x01
}

; Adds lanes 4 and 5 of an <8 x i16>. The fast-hops runs expect a horizontal
; add and then an extract of result lane 2.
define i16 @extract_extract45_v8i16_add_i16(<8 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract45_v8i16_add_i16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pextrw $4, %xmm0, %ecx
; SSE3-SLOW-NEXT: pextrw $5, %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract45_v8i16_add_i16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: pextrw $2, %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract45_v8i16_add_i16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpextrw $4, %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrw $5, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract45_v8i16_add_i16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrw $2, %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i16> %x, i32 4
  %x1 = extractelement <8 x i16> %x, i32 5
  %x01 = add i16 %x0, %x1
  ret i16 %x01
}

; Same as the lane 0/1 i16 add, with the add operands swapped (%x1 + %x0).
; The expected code is the same as for the unswapped form.
define i16 @extract_extract01_v8i16_add_i16_commute(<8 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8i16_add_i16_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pextrw $1, %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8i16_add_i16_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8i16_add_i16_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrw $1, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8i16_add_i16_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i16> %x, i32 0
  %x1 = extractelement <8 x i16> %x, i32 1
  %x01 = add i16 %x1, %x0
  ret i16 %x01
}

; Same as the lane 4/5 i16 add, with the add operands swapped (%x1 + %x0).
; The expected code is the same as for the unswapped form.
define i16 @extract_extract45_v8i16_add_i16_commute(<8 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract45_v8i16_add_i16_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pextrw $4, %xmm0, %ecx
; SSE3-SLOW-NEXT: pextrw $5, %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract45_v8i16_add_i16_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: pextrw $2, %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract45_v8i16_add_i16_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpextrw $4, %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrw $5, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract45_v8i16_add_i16_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrw $2, %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i16> %x, i32 4
  %x1 = extractelement <8 x i16> %x, i32 5
  %x01 = add i16 %x1, %x0
  ret i16 %x01
}

; Subtracts lane 1 from lane 0 of a <4 x i32>. The fast-hops runs expect one
; phsubd/vphsubd followed by a move of result lane 0.
define i32 @extract_extract01_v4i32_sub_i32(<4 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v4i32_sub_i32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: subl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v4i32_sub_i32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v4i32_sub_i32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %eax
; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %ecx
; AVX-SLOW-NEXT: subl %ecx, %eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v4i32_sub_i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x i32> %x, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %x01 = sub i32 %x0, %x1
  ret i32 %x01
}

; Subtracts lane 3 from lane 2 of a <4 x i32>. The fast-hops runs expect a
; horizontal subtract and then an extract of result lane 1.
define i32 @extract_extract23_v4i32_sub_i32(<4 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v4i32_sub_i32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %eax
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: subl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v4i32_sub_i32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v4i32_sub_i32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractps $2, %xmm0, %eax
; AVX-SLOW-NEXT: vextractps $3, %xmm0, %ecx
; AVX-SLOW-NEXT: subl %ecx, %eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4i32_sub_i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <4 x i32> %x, i32 2
  %x1 = extractelement <4 x i32> %x, i32 3
  %x01 = sub i32 %x0, %x1
  ret i32 %x01
}

; Subtracts lane 0 from lane 1 (operands reversed relative to the test above
; for lanes 0/1). Every run expects the scalar extract + subl sequence; no
; horizontal subtract is expected, with or without fast-hops.
define i32 @extract_extract01_v4i32_sub_i32_commute(<4 x i32> %x) {
; SSE3-LABEL: extract_extract01_v4i32_sub_i32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v4i32_sub_i32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %ecx
; AVX-NEXT: vpextrd $1, %xmm0, %eax
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: retq
  %x0 = extractelement <4 x i32> %x, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
  %x01 = sub i32 %x1, %x0
  ret i32 %x01
}

; Subtracts lane 2 from lane 3 (operands reversed). Every run expects the
; scalar extract + subl sequence; no horizontal subtract is expected.
define i32 @extract_extract23_v4i32_sub_i32_commute(<4 x i32> %x) {
; SSE3-LABEL: extract_extract23_v4i32_sub_i32_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-NEXT: movd %xmm1, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract23_v4i32_sub_i32_commute:
; AVX: # %bb.0:
; AVX-NEXT: vextractps $2, %xmm0, %ecx
; AVX-NEXT: vextractps $3, %xmm0, %eax
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: retq
  %x0 = extractelement <4 x i32> %x, i32 2
  %x1 = extractelement <4 x i32> %x, i32 3
  %x01 = sub i32 %x1, %x0
  ret i32 %x01
}

; Subtracts lane 1 from lane 0 of an <8 x i16>. The fast-hops runs expect one
; phsubw/vphsubw followed by a move of result lane 0.
define i16 @extract_extract01_v8i16_sub_i16(<8 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8i16_sub_i16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: pextrw $1, %xmm0, %ecx
; SSE3-SLOW-NEXT: subl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8i16_sub_i16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phsubw %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8i16_sub_i16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %eax
; AVX-SLOW-NEXT: vpextrw $1, %xmm0, %ecx
; AVX-SLOW-NEXT: subl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8i16_sub_i16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i16> %x, i32 0
  %x1 = extractelement <8 x i16> %x, i32 1
  %x01 = sub i16 %x0, %x1
  ret i16 %x01
}

; Subtracts lane 3 from lane 2 of an <8 x i16>. The fast-hops runs expect a
; horizontal subtract and then an extract of result lane 1.
define i16 @extract_extract23_v8i16_sub_i16(<8 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8i16_sub_i16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pextrw $2, %xmm0, %eax
; SSE3-SLOW-NEXT: pextrw $3, %xmm0, %ecx
; SSE3-SLOW-NEXT: subl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8i16_sub_i16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phsubw %xmm0, %xmm0
; SSE3-FAST-NEXT: pextrw $1, %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8i16_sub_i16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpextrw $2, %xmm0, %eax
; AVX-SLOW-NEXT: vpextrw $3, %xmm0, %ecx
; AVX-SLOW-NEXT: subl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8i16_sub_i16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrw $1, %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i16> %x, i32 2
  %x1 = extractelement <8 x i16> %x, i32 3
  %x01 = sub i16 %x0, %x1
  ret i16 %x01
}

; Subtracts lane 0 from lane 1 of an <8 x i16> (operands reversed). Every run
; expects the scalar extract + subl sequence; no horizontal subtract is expected.
define i16 @extract_extract01_v8i16_sub_i16_commute(<8 x i16> %x) {
; SSE3-LABEL: extract_extract01_v8i16_sub_i16_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pextrw $1, %xmm0, %eax
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract01_v8i16_sub_i16_commute:
; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %ecx
; AVX-NEXT: vpextrw $1, %xmm0, %eax
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
  %x0 = extractelement <8 x i16> %x, i32 0
  %x1 = extractelement <8 x i16> %x, i32 1
  %x01 = sub i16 %x1, %x0
  ret i16 %x01
}

; Subtracts lane 2 from lane 3 of an <8 x i16> (operands reversed). Every run
; expects the scalar extract + subl sequence; no horizontal subtract is expected.
define i16 @extract_extract23_v8i16_sub_i16_commute(<8 x i16> %x) {
; SSE3-LABEL: extract_extract23_v8i16_sub_i16_commute:
; SSE3: # %bb.0:
; SSE3-NEXT: pextrw $2, %xmm0, %ecx
; SSE3-NEXT: pextrw $3, %xmm0, %eax
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-NEXT: retq
;
; AVX-LABEL: extract_extract23_v8i16_sub_i16_commute:
; AVX: # %bb.0:
; AVX-NEXT: vpextrw $2, %xmm0, %ecx
; AVX-NEXT: vpextrw $3, %xmm0, %eax
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-NEXT: retq
  %x0 = extractelement <8 x i16> %x, i32 2
  %x1 = extractelement <8 x i16> %x, i32 3
  %x01 = sub i16 %x1, %x0
  ret i16 %x01
}

; 256-bit vectors, i32/i16, add/sub

; Adds lanes 0 and 1 of an <8 x i32>. The expected code only touches xmm0; the
; AVX runs additionally expect vzeroupper before the return.
define i32 @extract_extract01_v8i32_add_i32(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8i32_add_i32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8i32_add_i32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8i32_add_i32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8i32_add_i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i32> %x, i32 0
  %x1 = extractelement <8 x i32> %x, i32 1
  %x01 = add i32 %x0, %x1
  ret i32 %x01
}

; Adds lanes 2 and 3 of an <8 x i32>. The fast-hops runs expect a horizontal
; add on xmm0 and then an extract of result lane 1.
define i32 @extract_extract23_v8i32_add_i32(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8i32_add_i32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8i32_add_i32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx
; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8i32_add_i32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i32> %x, i32 2
  %x1 = extractelement <8 x i32> %x, i32 3
  %x01 = add i32 %x0, %x1
  ret i32 %x01
}

; Adds lanes 6 and 7 of an <8 x i32> (upper 128-bit half). The SSE3 runs read
; xmm1; the AVX runs first extract the upper half of ymm0 with
; vextractf128/vextracti128.
define i32 @extract_extract67_v8i32_add_i32(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8i32_add_i32:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8i32_add_i32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx
; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: extract_extract67_v8i32_add_i32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-FAST-LABEL: extract_extract67_v8i32_add_i32:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-FAST-LABEL: extract_extract67_v8i32_add_i32:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX512-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
  %x0 = extractelement <8 x i32> %x, i32 6
  %x1 = extractelement <8 x i32> %x, i32 7
  %x01 = add i32 %x0, %x1
  ret i32 %x01
}

; Same as the <8 x i32> lane 0/1 add, with the add operands swapped.
; The expected code is the same as for the unswapped form.
define i32 @extract_extract01_v8i32_add_i32_commute(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v8i32_add_i32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v8i32_add_i32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v8i32_add_i32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v8i32_add_i32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i32> %x, i32 0
  %x1 = extractelement <8 x i32> %x, i32 1
  %x01 = add i32 %x1, %x0
  ret i32 %x01
}

; Same as the <8 x i32> lane 2/3 add, with the add operands swapped.
; The expected code is the same as for the unswapped form.
define i32 @extract_extract23_v8i32_add_i32_commute(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v8i32_add_i32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx
; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v8i32_add_i32_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <8 x i32> %x, i32 2
  %x1 = extractelement <8 x i32> %x, i32 3
  %x01 = add i32 %x1, %x0
  ret i32 %x01
}

; Same as the <8 x i32> lane 6/7 add, with the add operands swapped.
; The expected code is the same as for the unswapped form.
define i32 @extract_extract67_v8i32_add_i32_commute(<8 x i32> %x) {
; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSE3-SLOW-NEXT: movd %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract67_v8i32_add_i32_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-SLOW-NEXT: vextractps $2, %xmm0, %ecx
; AVX-SLOW-NEXT: vextractps $3, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: extract_extract67_v8i32_add_i32_commute:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-FAST-LABEL: extract_extract67_v8i32_add_i32_commute:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-FAST-LABEL: extract_extract67_v8i32_add_i32_commute:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX512-FAST-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
  %x0 = extractelement <8 x i32> %x, i32 6
  %x1 = extractelement <8 x i32> %x, i32 7
  %x01 = add i32 %x1, %x0
  ret i32 %x01
}

; Adds lanes 0 and 1 of a <16 x i16>. The expected code only touches xmm0; the
; AVX runs additionally expect vzeroupper before the return.
define i16 @extract_extract01_v16i16_add_i16(<16 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16i16_add_i16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pextrw $1, %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16i16_add_i16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16i16_add_i16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrw $1, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16i16_add_i16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x i16> %x, i32 0
  %x1 = extractelement <16 x i16> %x, i32 1
  %x01 = add i16 %x0, %x1
  ret i16 %x01
}

; Adds lanes 2 and 3 of a <16 x i16>. The fast-hops runs expect a horizontal
; add on xmm0 and then an extract of result lane 1.
define i16 @extract_extract23_v16i16_add_i16(<16 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract23_v16i16_add_i16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pextrw $2, %xmm0, %ecx
; SSE3-SLOW-NEXT: pextrw $3, %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract23_v16i16_add_i16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: pextrw $1, %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract23_v16i16_add_i16:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpextrw $2, %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrw $3, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v16i16_add_i16:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrw $1, %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x i16> %x, i32 2
  %x1 = extractelement <16 x i16> %x, i32 3
  %x01 = add i16 %x0, %x1
  ret i16 %x01
}

; Adds lanes 8 and 9 of a <16 x i16> (first two lanes of the upper 128-bit
; half). The SSE3 runs read xmm1; the AVX runs first extract the upper half of
; ymm0 with vextractf128 (AVX1) or vextracti128 (AVX2/AVX512).
define i16 @extract_extract89_v16i16_add_i16(<16 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract89_v16i16_add_i16:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pextrw $1, %xmm1, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract89_v16i16_add_i16:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm1, %xmm1
; SSE3-FAST-NEXT: movd %xmm1, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: extract_extract89_v16i16_add_i16:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX1-SLOW-NEXT: vpextrw $1, %xmm0, %eax
; AVX1-SLOW-NEXT: addl %ecx, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: extract_extract89_v16i16_add_i16:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX2-SLOW-NEXT: vpextrw $1, %xmm0, %eax
; AVX2-SLOW-NEXT: addl %ecx, %eax
; AVX2-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vmovd %xmm0, %eax
; AVX2-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX2-FAST-NEXT: vzeroupper
; AVX2-FAST-NEXT: retq
;
; AVX512-SLOW-LABEL: extract_extract89_v16i16_add_i16:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX512-SLOW-NEXT: vpextrw $1, %xmm0, %eax
; AVX512-SLOW-NEXT: addl %ecx, %eax
; AVX512-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-SLOW-NEXT: vzeroupper
; AVX512-SLOW-NEXT: retq
;
; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX512-FAST-NEXT: vmovd %xmm0, %eax
; AVX512-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
  %x0 = extractelement <16 x i16> %x, i32 8
  %x1 = extractelement <16 x i16> %x, i32 9
  %x01 = add i16 %x0, %x1
  ret i16 %x01
}

; Same as the <16 x i16> lane 0/1 add, with the add operands swapped.
; The expected code is the same as for the unswapped form.
define i16 @extract_extract01_v16i16_add_i16_commute(<16 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract01_v16i16_add_i16_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm0, %ecx
; SSE3-SLOW-NEXT: pextrw $1, %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract01_v16i16_add_i16_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: movd %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract01_v16i16_add_i16_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrw $1, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract01_v16i16_add_i16_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vmovd %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x i16> %x, i32 0
  %x1 = extractelement <16 x i16> %x, i32 1
  %x01 = add i16 %x1, %x0
  ret i16 %x01
}

; Adds lanes 4 and 5 of a <16 x i16>, written as %x1 + %x0. The fast-hops runs
; expect a horizontal add on xmm0 and then an extract of result lane 2.
define i16 @extract_extract45_v16i16_add_i16_commute(<16 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract45_v16i16_add_i16_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: pextrw $4, %xmm0, %ecx
; SSE3-SLOW-NEXT: pextrw $5, %xmm0, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract45_v16i16_add_i16_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
; SSE3-FAST-NEXT: pextrw $2, %xmm0, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: extract_extract45_v16i16_add_i16_commute:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpextrw $4, %xmm0, %ecx
; AVX-SLOW-NEXT: vpextrw $5, %xmm0, %eax
; AVX-SLOW-NEXT: addl %ecx, %eax
; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract45_v16i16_add_i16_commute:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vpextrw $2, %xmm0, %eax
; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
  %x0 = extractelement <16 x i16> %x, i32 4
  %x1 = extractelement <16 x i16> %x, i32 5
  %x01 = add i16 %x1, %x0
  ret i16 %x01
}

define i16 @extract_extract89_v16i16_add_i16_commute(<16 x i16> %x) {
; SSE3-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute:
; SSE3-SLOW: # %bb.0:
; SSE3-SLOW-NEXT: movd %xmm1, %ecx
; SSE3-SLOW-NEXT: pextrw $1, %xmm1, %eax
; SSE3-SLOW-NEXT: addl %ecx, %eax
; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-SLOW-NEXT: retq
;
; SSE3-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
; SSE3-FAST: # %bb.0:
; SSE3-FAST-NEXT: phaddw %xmm1, %xmm1
; SSE3-FAST-NEXT: movd %xmm1, %eax
; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; SSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX1-SLOW-NEXT: vpextrw $1, %xmm0, %eax
; AVX1-SLOW-NEXT: addl %ecx, %eax
; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-SLOW-NEXT: vzeroupper
; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT: vmovd %xmm0, %eax
; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
; AVX1-FAST-NEXT: vzeroupper
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-SLOW-NEXT: vmovd %xmm0, %ecx
; AVX2-SLOW-NEXT: vpextrw $1, %xmm0, %eax
;
AVX2-SLOW-NEXT: addl %ecx, %eax 1047; AVX2-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1048; AVX2-SLOW-NEXT: vzeroupper 1049; AVX2-SLOW-NEXT: retq 1050; 1051; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16_commute: 1052; AVX2-FAST: # %bb.0: 1053; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 1054; AVX2-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1055; AVX2-FAST-NEXT: vmovd %xmm0, %eax 1056; AVX2-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1057; AVX2-FAST-NEXT: vzeroupper 1058; AVX2-FAST-NEXT: retq 1059; 1060; AVX512-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute: 1061; AVX512-SLOW: # %bb.0: 1062; AVX512-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 1063; AVX512-SLOW-NEXT: vmovd %xmm0, %ecx 1064; AVX512-SLOW-NEXT: vpextrw $1, %xmm0, %eax 1065; AVX512-SLOW-NEXT: addl %ecx, %eax 1066; AVX512-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1067; AVX512-SLOW-NEXT: vzeroupper 1068; AVX512-SLOW-NEXT: retq 1069; 1070; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16_commute: 1071; AVX512-FAST: # %bb.0: 1072; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 1073; AVX512-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1074; AVX512-FAST-NEXT: vmovd %xmm0, %eax 1075; AVX512-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1076; AVX512-FAST-NEXT: vzeroupper 1077; AVX512-FAST-NEXT: retq 1078 %x0 = extractelement <16 x i16> %x, i32 8 1079 %x1 = extractelement <16 x i16> %x, i32 9 1080 %x01 = add i16 %x1, %x0 1081 ret i16 %x01 1082} 1083 1084define i32 @extract_extract01_v8i32_sub_i32(<8 x i32> %x) { 1085; SSE3-SLOW-LABEL: extract_extract01_v8i32_sub_i32: 1086; SSE3-SLOW: # %bb.0: 1087; SSE3-SLOW-NEXT: movd %xmm0, %eax 1088; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1089; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1090; SSE3-SLOW-NEXT: subl %ecx, %eax 1091; SSE3-SLOW-NEXT: retq 1092; 1093; SSE3-FAST-LABEL: extract_extract01_v8i32_sub_i32: 1094; SSE3-FAST: # %bb.0: 1095; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0 1096; SSE3-FAST-NEXT: movd %xmm0, %eax 1097; SSE3-FAST-NEXT: retq 
1098; 1099; AVX-SLOW-LABEL: extract_extract01_v8i32_sub_i32: 1100; AVX-SLOW: # %bb.0: 1101; AVX-SLOW-NEXT: vmovd %xmm0, %eax 1102; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %ecx 1103; AVX-SLOW-NEXT: subl %ecx, %eax 1104; AVX-SLOW-NEXT: vzeroupper 1105; AVX-SLOW-NEXT: retq 1106; 1107; AVX-FAST-LABEL: extract_extract01_v8i32_sub_i32: 1108; AVX-FAST: # %bb.0: 1109; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 1110; AVX-FAST-NEXT: vmovd %xmm0, %eax 1111; AVX-FAST-NEXT: vzeroupper 1112; AVX-FAST-NEXT: retq 1113 %x0 = extractelement <8 x i32> %x, i32 0 1114 %x1 = extractelement <8 x i32> %x, i32 1 1115 %x01 = sub i32 %x0, %x1 1116 ret i32 %x01 1117} 1118 1119define i32 @extract_extract23_v8i32_sub_i32(<8 x i32> %x) { 1120; SSE3-SLOW-LABEL: extract_extract23_v8i32_sub_i32: 1121; SSE3-SLOW: # %bb.0: 1122; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1123; SSE3-SLOW-NEXT: movd %xmm1, %eax 1124; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 1125; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1126; SSE3-SLOW-NEXT: subl %ecx, %eax 1127; SSE3-SLOW-NEXT: retq 1128; 1129; SSE3-FAST-LABEL: extract_extract23_v8i32_sub_i32: 1130; SSE3-FAST: # %bb.0: 1131; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0 1132; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1133; SSE3-FAST-NEXT: movd %xmm0, %eax 1134; SSE3-FAST-NEXT: retq 1135; 1136; AVX-SLOW-LABEL: extract_extract23_v8i32_sub_i32: 1137; AVX-SLOW: # %bb.0: 1138; AVX-SLOW-NEXT: vextractps $2, %xmm0, %eax 1139; AVX-SLOW-NEXT: vextractps $3, %xmm0, %ecx 1140; AVX-SLOW-NEXT: subl %ecx, %eax 1141; AVX-SLOW-NEXT: vzeroupper 1142; AVX-SLOW-NEXT: retq 1143; 1144; AVX-FAST-LABEL: extract_extract23_v8i32_sub_i32: 1145; AVX-FAST: # %bb.0: 1146; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 1147; AVX-FAST-NEXT: vpextrd $1, %xmm0, %eax 1148; AVX-FAST-NEXT: vzeroupper 1149; AVX-FAST-NEXT: retq 1150 %x0 = extractelement <8 x i32> %x, i32 2 1151 %x1 = extractelement <8 x i32> %x, i32 3 1152 %x01 = sub i32 %x0, %x1 1153 ret i32 %x01 1154} 1155 1156define i32 
@extract_extract67_v8i32_sub_i32(<8 x i32> %x) { 1157; SSE3-SLOW-LABEL: extract_extract67_v8i32_sub_i32: 1158; SSE3-SLOW: # %bb.0: 1159; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 1160; SSE3-SLOW-NEXT: movd %xmm0, %eax 1161; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] 1162; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1163; SSE3-SLOW-NEXT: subl %ecx, %eax 1164; SSE3-SLOW-NEXT: retq 1165; 1166; SSE3-FAST-LABEL: extract_extract67_v8i32_sub_i32: 1167; SSE3-FAST: # %bb.0: 1168; SSE3-FAST-NEXT: phsubd %xmm1, %xmm1 1169; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1170; SSE3-FAST-NEXT: movd %xmm0, %eax 1171; SSE3-FAST-NEXT: retq 1172; 1173; AVX-SLOW-LABEL: extract_extract67_v8i32_sub_i32: 1174; AVX-SLOW: # %bb.0: 1175; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 1176; AVX-SLOW-NEXT: vextractps $2, %xmm0, %eax 1177; AVX-SLOW-NEXT: vextractps $3, %xmm0, %ecx 1178; AVX-SLOW-NEXT: subl %ecx, %eax 1179; AVX-SLOW-NEXT: vzeroupper 1180; AVX-SLOW-NEXT: retq 1181; 1182; AVX1-FAST-LABEL: extract_extract67_v8i32_sub_i32: 1183; AVX1-FAST: # %bb.0: 1184; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 1185; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 1186; AVX1-FAST-NEXT: vpextrd $1, %xmm0, %eax 1187; AVX1-FAST-NEXT: vzeroupper 1188; AVX1-FAST-NEXT: retq 1189; 1190; AVX2-FAST-LABEL: extract_extract67_v8i32_sub_i32: 1191; AVX2-FAST: # %bb.0: 1192; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 1193; AVX2-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 1194; AVX2-FAST-NEXT: vpextrd $1, %xmm0, %eax 1195; AVX2-FAST-NEXT: vzeroupper 1196; AVX2-FAST-NEXT: retq 1197; 1198; AVX512-FAST-LABEL: extract_extract67_v8i32_sub_i32: 1199; AVX512-FAST: # %bb.0: 1200; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 1201; AVX512-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 1202; AVX512-FAST-NEXT: vpextrd $1, %xmm0, %eax 1203; AVX512-FAST-NEXT: vzeroupper 1204; AVX512-FAST-NEXT: retq 1205 %x0 = extractelement <8 x i32> %x, i32 6 1206 %x1 = extractelement <8 x i32> %x, i32 7 1207 %x01 = sub i32 %x0, %x1 
1208 ret i32 %x01 1209} 1210 1211; Negative test: the operands are commuted (x1 - x0), so this does not match a horizontal sub directly. TODO: use a horizontal sub and negate the result? 1212 1213define i32 @extract_extract01_v8i32_sub_i32_commute(<8 x i32> %x) { 1214; SSE3-LABEL: extract_extract01_v8i32_sub_i32_commute: 1215; SSE3: # %bb.0: 1216; SSE3-NEXT: movd %xmm0, %ecx 1217; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1218; SSE3-NEXT: movd %xmm0, %eax 1219; SSE3-NEXT: subl %ecx, %eax 1220; SSE3-NEXT: retq 1221; 1222; AVX-LABEL: extract_extract01_v8i32_sub_i32_commute: 1223; AVX: # %bb.0: 1224; AVX-NEXT: vmovd %xmm0, %ecx 1225; AVX-NEXT: vpextrd $1, %xmm0, %eax 1226; AVX-NEXT: subl %ecx, %eax 1227; AVX-NEXT: vzeroupper 1228; AVX-NEXT: retq 1229 %x0 = extractelement <8 x i32> %x, i32 0 1230 %x1 = extractelement <8 x i32> %x, i32 1 1231 %x01 = sub i32 %x1, %x0 1232 ret i32 %x01 1233} 1234 1235define i16 @extract_extract01_v16i16_sub_i16(<16 x i16> %x) { 1236; SSE3-SLOW-LABEL: extract_extract01_v16i16_sub_i16: 1237; SSE3-SLOW: # %bb.0: 1238; SSE3-SLOW-NEXT: movd %xmm0, %eax 1239; SSE3-SLOW-NEXT: pextrw $1, %xmm0, %ecx 1240; SSE3-SLOW-NEXT: subl %ecx, %eax 1241; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1242; SSE3-SLOW-NEXT: retq 1243; 1244; SSE3-FAST-LABEL: extract_extract01_v16i16_sub_i16: 1245; SSE3-FAST: # %bb.0: 1246; SSE3-FAST-NEXT: phsubw %xmm0, %xmm0 1247; SSE3-FAST-NEXT: movd %xmm0, %eax 1248; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1249; SSE3-FAST-NEXT: retq 1250; 1251; AVX-SLOW-LABEL: extract_extract01_v16i16_sub_i16: 1252; AVX-SLOW: # %bb.0: 1253; AVX-SLOW-NEXT: vmovd %xmm0, %eax 1254; AVX-SLOW-NEXT: vpextrw $1, %xmm0, %ecx 1255; AVX-SLOW-NEXT: subl %ecx, %eax 1256; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1257; AVX-SLOW-NEXT: vzeroupper 1258; AVX-SLOW-NEXT: retq 1259; 1260; AVX-FAST-LABEL: extract_extract01_v16i16_sub_i16: 1261; AVX-FAST: # %bb.0: 1262; AVX-FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0 1263; AVX-FAST-NEXT: vmovd %xmm0, %eax 1264; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1265; AVX-FAST-NEXT: 
vzeroupper 1266; AVX-FAST-NEXT: retq 1267 %x0 = extractelement <16 x i16> %x, i32 0 1268 %x1 = extractelement <16 x i16> %x, i32 1 1269 %x01 = sub i16 %x0, %x1 1270 ret i16 %x01 1271} 1272 1273; Negative test: the operands are commuted (x1 - x0), so this does not match a horizontal sub directly. TODO: use a horizontal sub and negate the result? 1274 1275define i16 @extract_extract01_v16i16_sub_i16_commute(<16 x i16> %x) { 1276; SSE3-LABEL: extract_extract01_v16i16_sub_i16_commute: 1277; SSE3: # %bb.0: 1278; SSE3-NEXT: movd %xmm0, %ecx 1279; SSE3-NEXT: pextrw $1, %xmm0, %eax 1280; SSE3-NEXT: subl %ecx, %eax 1281; SSE3-NEXT: # kill: def $ax killed $ax killed $eax 1282; SSE3-NEXT: retq 1283; 1284; AVX-LABEL: extract_extract01_v16i16_sub_i16_commute: 1285; AVX: # %bb.0: 1286; AVX-NEXT: vmovd %xmm0, %ecx 1287; AVX-NEXT: vpextrw $1, %xmm0, %eax 1288; AVX-NEXT: subl %ecx, %eax 1289; AVX-NEXT: # kill: def $ax killed $ax killed $eax 1290; AVX-NEXT: vzeroupper 1291; AVX-NEXT: retq 1292 %x0 = extractelement <16 x i16> %x, i32 0 1293 %x1 = extractelement <16 x i16> %x, i32 1 1294 %x01 = sub i16 %x1, %x0 1295 ret i16 %x01 1296} 1297 1298; 512-bit vectors, i32/i16, add/sub 1299 1300define i32 @extract_extract01_v16i32_add_i32(<16 x i32> %x) { 1301; SSE3-SLOW-LABEL: extract_extract01_v16i32_add_i32: 1302; SSE3-SLOW: # %bb.0: 1303; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1304; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1305; SSE3-SLOW-NEXT: movd %xmm0, %eax 1306; SSE3-SLOW-NEXT: addl %ecx, %eax 1307; SSE3-SLOW-NEXT: retq 1308; 1309; SSE3-FAST-LABEL: extract_extract01_v16i32_add_i32: 1310; SSE3-FAST: # %bb.0: 1311; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 1312; SSE3-FAST-NEXT: movd %xmm0, %eax 1313; SSE3-FAST-NEXT: retq 1314; 1315; AVX-SLOW-LABEL: extract_extract01_v16i32_add_i32: 1316; AVX-SLOW: # %bb.0: 1317; AVX-SLOW-NEXT: vmovd %xmm0, %ecx 1318; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %eax 1319; AVX-SLOW-NEXT: addl %ecx, %eax 1320; AVX-SLOW-NEXT: vzeroupper 1321; AVX-SLOW-NEXT: retq 1322; 1323; AVX-FAST-LABEL: extract_extract01_v16i32_add_i32: 1324; AVX-FAST: # %bb.0: 1325; AVX-FAST-NEXT: 
vphaddd %xmm0, %xmm0, %xmm0 1326; AVX-FAST-NEXT: vmovd %xmm0, %eax 1327; AVX-FAST-NEXT: vzeroupper 1328; AVX-FAST-NEXT: retq 1329 %x0 = extractelement <16 x i32> %x, i32 0 1330 %x1 = extractelement <16 x i32> %x, i32 1 1331 %x01 = add i32 %x0, %x1 1332 ret i32 %x01 1333} 1334 1335define i32 @extract_extract01_v16i32_add_i32_commute(<16 x i32> %x) { 1336; SSE3-SLOW-LABEL: extract_extract01_v16i32_add_i32_commute: 1337; SSE3-SLOW: # %bb.0: 1338; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1339; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1340; SSE3-SLOW-NEXT: movd %xmm0, %eax 1341; SSE3-SLOW-NEXT: addl %ecx, %eax 1342; SSE3-SLOW-NEXT: retq 1343; 1344; SSE3-FAST-LABEL: extract_extract01_v16i32_add_i32_commute: 1345; SSE3-FAST: # %bb.0: 1346; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 1347; SSE3-FAST-NEXT: movd %xmm0, %eax 1348; SSE3-FAST-NEXT: retq 1349; 1350; AVX-SLOW-LABEL: extract_extract01_v16i32_add_i32_commute: 1351; AVX-SLOW: # %bb.0: 1352; AVX-SLOW-NEXT: vmovd %xmm0, %ecx 1353; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %eax 1354; AVX-SLOW-NEXT: addl %ecx, %eax 1355; AVX-SLOW-NEXT: vzeroupper 1356; AVX-SLOW-NEXT: retq 1357; 1358; AVX-FAST-LABEL: extract_extract01_v16i32_add_i32_commute: 1359; AVX-FAST: # %bb.0: 1360; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 1361; AVX-FAST-NEXT: vmovd %xmm0, %eax 1362; AVX-FAST-NEXT: vzeroupper 1363; AVX-FAST-NEXT: retq 1364 %x0 = extractelement <16 x i32> %x, i32 0 1365 %x1 = extractelement <16 x i32> %x, i32 1 1366 %x01 = add i32 %x1, %x0 1367 ret i32 %x01 1368} 1369 1370define i16 @extract_extract01_v32i16_add_i16(<32 x i16> %x) { 1371; SSE3-SLOW-LABEL: extract_extract01_v32i16_add_i16: 1372; SSE3-SLOW: # %bb.0: 1373; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1374; SSE3-SLOW-NEXT: pextrw $1, %xmm0, %eax 1375; SSE3-SLOW-NEXT: addl %ecx, %eax 1376; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1377; SSE3-SLOW-NEXT: retq 1378; 1379; SSE3-FAST-LABEL: extract_extract01_v32i16_add_i16: 1380; SSE3-FAST: # %bb.0: 1381; SSE3-FAST-NEXT: phaddw %xmm0, 
%xmm0 1382; SSE3-FAST-NEXT: movd %xmm0, %eax 1383; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1384; SSE3-FAST-NEXT: retq 1385; 1386; AVX-SLOW-LABEL: extract_extract01_v32i16_add_i16: 1387; AVX-SLOW: # %bb.0: 1388; AVX-SLOW-NEXT: vmovd %xmm0, %ecx 1389; AVX-SLOW-NEXT: vpextrw $1, %xmm0, %eax 1390; AVX-SLOW-NEXT: addl %ecx, %eax 1391; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1392; AVX-SLOW-NEXT: vzeroupper 1393; AVX-SLOW-NEXT: retq 1394; 1395; AVX-FAST-LABEL: extract_extract01_v32i16_add_i16: 1396; AVX-FAST: # %bb.0: 1397; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1398; AVX-FAST-NEXT: vmovd %xmm0, %eax 1399; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1400; AVX-FAST-NEXT: vzeroupper 1401; AVX-FAST-NEXT: retq 1402 %x0 = extractelement <32 x i16> %x, i32 0 1403 %x1 = extractelement <32 x i16> %x, i32 1 1404 %x01 = add i16 %x0, %x1 1405 ret i16 %x01 1406} 1407 1408define i16 @extract_extract01_v32i16_add_i16_commute(<32 x i16> %x) { 1409; SSE3-SLOW-LABEL: extract_extract01_v32i16_add_i16_commute: 1410; SSE3-SLOW: # %bb.0: 1411; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1412; SSE3-SLOW-NEXT: pextrw $1, %xmm0, %eax 1413; SSE3-SLOW-NEXT: addl %ecx, %eax 1414; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1415; SSE3-SLOW-NEXT: retq 1416; 1417; SSE3-FAST-LABEL: extract_extract01_v32i16_add_i16_commute: 1418; SSE3-FAST: # %bb.0: 1419; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 1420; SSE3-FAST-NEXT: movd %xmm0, %eax 1421; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1422; SSE3-FAST-NEXT: retq 1423; 1424; AVX-SLOW-LABEL: extract_extract01_v32i16_add_i16_commute: 1425; AVX-SLOW: # %bb.0: 1426; AVX-SLOW-NEXT: vmovd %xmm0, %ecx 1427; AVX-SLOW-NEXT: vpextrw $1, %xmm0, %eax 1428; AVX-SLOW-NEXT: addl %ecx, %eax 1429; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1430; AVX-SLOW-NEXT: vzeroupper 1431; AVX-SLOW-NEXT: retq 1432; 1433; AVX-FAST-LABEL: extract_extract01_v32i16_add_i16_commute: 1434; AVX-FAST: # %bb.0: 1435; AVX-FAST-NEXT: 
vphaddw %xmm0, %xmm0, %xmm0 1436; AVX-FAST-NEXT: vmovd %xmm0, %eax 1437; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1438; AVX-FAST-NEXT: vzeroupper 1439; AVX-FAST-NEXT: retq 1440 %x0 = extractelement <32 x i16> %x, i32 0 1441 %x1 = extractelement <32 x i16> %x, i32 1 1442 %x01 = add i16 %x1, %x0 1443 ret i16 %x01 1444} 1445 1446define i32 @extract_extract01_v16i32_sub_i32(<16 x i32> %x) { 1447; SSE3-SLOW-LABEL: extract_extract01_v16i32_sub_i32: 1448; SSE3-SLOW: # %bb.0: 1449; SSE3-SLOW-NEXT: movd %xmm0, %eax 1450; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1451; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1452; SSE3-SLOW-NEXT: subl %ecx, %eax 1453; SSE3-SLOW-NEXT: retq 1454; 1455; SSE3-FAST-LABEL: extract_extract01_v16i32_sub_i32: 1456; SSE3-FAST: # %bb.0: 1457; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0 1458; SSE3-FAST-NEXT: movd %xmm0, %eax 1459; SSE3-FAST-NEXT: retq 1460; 1461; AVX-SLOW-LABEL: extract_extract01_v16i32_sub_i32: 1462; AVX-SLOW: # %bb.0: 1463; AVX-SLOW-NEXT: vmovd %xmm0, %eax 1464; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %ecx 1465; AVX-SLOW-NEXT: subl %ecx, %eax 1466; AVX-SLOW-NEXT: vzeroupper 1467; AVX-SLOW-NEXT: retq 1468; 1469; AVX-FAST-LABEL: extract_extract01_v16i32_sub_i32: 1470; AVX-FAST: # %bb.0: 1471; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 1472; AVX-FAST-NEXT: vmovd %xmm0, %eax 1473; AVX-FAST-NEXT: vzeroupper 1474; AVX-FAST-NEXT: retq 1475 %x0 = extractelement <16 x i32> %x, i32 0 1476 %x1 = extractelement <16 x i32> %x, i32 1 1477 %x01 = sub i32 %x0, %x1 1478 ret i32 %x01 1479} 1480 1481define i32 @extract_extract01_v16i32_sub_i32_commute(<16 x i32> %x) { 1482; SSE3-LABEL: extract_extract01_v16i32_sub_i32_commute: 1483; SSE3: # %bb.0: 1484; SSE3-NEXT: movd %xmm0, %ecx 1485; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1486; SSE3-NEXT: movd %xmm0, %eax 1487; SSE3-NEXT: subl %ecx, %eax 1488; SSE3-NEXT: retq 1489; 1490; AVX-LABEL: extract_extract01_v16i32_sub_i32_commute: 1491; AVX: # %bb.0: 1492; AVX-NEXT: vmovd %xmm0, %ecx 1493; 
AVX-NEXT: vpextrd $1, %xmm0, %eax 1494; AVX-NEXT: subl %ecx, %eax 1495; AVX-NEXT: vzeroupper 1496; AVX-NEXT: retq 1497 %x0 = extractelement <16 x i32> %x, i32 0 1498 %x1 = extractelement <16 x i32> %x, i32 1 1499 %x01 = sub i32 %x1, %x0 1500 ret i32 %x01 1501} 1502 1503define i16 @extract_extract01_v32i16_sub_i16(<32 x i16> %x) { 1504; SSE3-SLOW-LABEL: extract_extract01_v32i16_sub_i16: 1505; SSE3-SLOW: # %bb.0: 1506; SSE3-SLOW-NEXT: movd %xmm0, %eax 1507; SSE3-SLOW-NEXT: pextrw $1, %xmm0, %ecx 1508; SSE3-SLOW-NEXT: subl %ecx, %eax 1509; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1510; SSE3-SLOW-NEXT: retq 1511; 1512; SSE3-FAST-LABEL: extract_extract01_v32i16_sub_i16: 1513; SSE3-FAST: # %bb.0: 1514; SSE3-FAST-NEXT: phsubw %xmm0, %xmm0 1515; SSE3-FAST-NEXT: movd %xmm0, %eax 1516; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1517; SSE3-FAST-NEXT: retq 1518; 1519; AVX-SLOW-LABEL: extract_extract01_v32i16_sub_i16: 1520; AVX-SLOW: # %bb.0: 1521; AVX-SLOW-NEXT: vmovd %xmm0, %eax 1522; AVX-SLOW-NEXT: vpextrw $1, %xmm0, %ecx 1523; AVX-SLOW-NEXT: subl %ecx, %eax 1524; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1525; AVX-SLOW-NEXT: vzeroupper 1526; AVX-SLOW-NEXT: retq 1527; 1528; AVX-FAST-LABEL: extract_extract01_v32i16_sub_i16: 1529; AVX-FAST: # %bb.0: 1530; AVX-FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0 1531; AVX-FAST-NEXT: vmovd %xmm0, %eax 1532; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1533; AVX-FAST-NEXT: vzeroupper 1534; AVX-FAST-NEXT: retq 1535 %x0 = extractelement <32 x i16> %x, i32 0 1536 %x1 = extractelement <32 x i16> %x, i32 1 1537 %x01 = sub i16 %x0, %x1 1538 ret i16 %x01 1539} 1540 1541define i16 @extract_extract01_v32i16_sub_i16_commute(<32 x i16> %x) { 1542; SSE3-LABEL: extract_extract01_v32i16_sub_i16_commute: 1543; SSE3: # %bb.0: 1544; SSE3-NEXT: movd %xmm0, %ecx 1545; SSE3-NEXT: pextrw $1, %xmm0, %eax 1546; SSE3-NEXT: subl %ecx, %eax 1547; SSE3-NEXT: # kill: def $ax killed $ax killed $eax 1548; SSE3-NEXT: retq 1549; 
1550; AVX-LABEL: extract_extract01_v32i16_sub_i16_commute: 1551; AVX: # %bb.0: 1552; AVX-NEXT: vmovd %xmm0, %ecx 1553; AVX-NEXT: vpextrw $1, %xmm0, %eax 1554; AVX-NEXT: subl %ecx, %eax 1555; AVX-NEXT: # kill: def $ax killed $ax killed $eax 1556; AVX-NEXT: vzeroupper 1557; AVX-NEXT: retq 1558 %x0 = extractelement <32 x i16> %x, i32 0 1559 %x1 = extractelement <32 x i16> %x, i32 1 1560 %x01 = sub i16 %x1, %x0 1561 ret i16 %x01 1562} 1563 1564; Check output when 1 or both extracts have extra uses. 1565 1566define i32 @extract_extract01_v4i32_add_i32_uses1(<4 x i32> %x, i32* %p) { 1567; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses1: 1568; SSE3-SLOW: # %bb.0: 1569; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1570; SSE3-SLOW-NEXT: movd %xmm0, (%rdi) 1571; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1572; SSE3-SLOW-NEXT: movd %xmm0, %eax 1573; SSE3-SLOW-NEXT: addl %ecx, %eax 1574; SSE3-SLOW-NEXT: retq 1575; 1576; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_uses1: 1577; SSE3-FAST: # %bb.0: 1578; SSE3-FAST-NEXT: movd %xmm0, (%rdi) 1579; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 1580; SSE3-FAST-NEXT: movd %xmm0, %eax 1581; SSE3-FAST-NEXT: retq 1582; 1583; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses1: 1584; AVX-SLOW: # %bb.0: 1585; AVX-SLOW-NEXT: vmovd %xmm0, %ecx 1586; AVX-SLOW-NEXT: vmovd %xmm0, (%rdi) 1587; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %eax 1588; AVX-SLOW-NEXT: addl %ecx, %eax 1589; AVX-SLOW-NEXT: retq 1590; 1591; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_uses1: 1592; AVX-FAST: # %bb.0: 1593; AVX-FAST-NEXT: vmovd %xmm0, (%rdi) 1594; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 1595; AVX-FAST-NEXT: vmovd %xmm0, %eax 1596; AVX-FAST-NEXT: retq 1597 %x0 = extractelement <4 x i32> %x, i32 0 1598 store i32 %x0, i32* %p 1599 %x1 = extractelement <4 x i32> %x, i32 1 1600 %x01 = add i32 %x0, %x1 1601 ret i32 %x01 1602} 1603 1604define i32 @extract_extract01_v4i32_add_i32_uses2(<4 x i32> %x, i32* %p) { 1605; SSE3-SLOW-LABEL: 
extract_extract01_v4i32_add_i32_uses2: 1606; SSE3-SLOW: # %bb.0: 1607; SSE3-SLOW-NEXT: movd %xmm0, %ecx 1608; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1609; SSE3-SLOW-NEXT: movd %xmm0, %eax 1610; SSE3-SLOW-NEXT: addl %ecx, %eax 1611; SSE3-SLOW-NEXT: movd %xmm0, (%rdi) 1612; SSE3-SLOW-NEXT: retq 1613; 1614; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_uses2: 1615; SSE3-FAST: # %bb.0: 1616; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1617; SSE3-FAST-NEXT: movd %xmm1, (%rdi) 1618; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 1619; SSE3-FAST-NEXT: movd %xmm0, %eax 1620; SSE3-FAST-NEXT: retq 1621; 1622; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses2: 1623; AVX-SLOW: # %bb.0: 1624; AVX-SLOW-NEXT: vmovd %xmm0, %ecx 1625; AVX-SLOW-NEXT: vpextrd $1, %xmm0, %eax 1626; AVX-SLOW-NEXT: addl %ecx, %eax 1627; AVX-SLOW-NEXT: vpextrd $1, %xmm0, (%rdi) 1628; AVX-SLOW-NEXT: retq 1629; 1630; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_uses2: 1631; AVX-FAST: # %bb.0: 1632; AVX-FAST-NEXT: vpextrd $1, %xmm0, (%rdi) 1633; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 1634; AVX-FAST-NEXT: vmovd %xmm0, %eax 1635; AVX-FAST-NEXT: retq 1636 %x0 = extractelement <4 x i32> %x, i32 0 1637 %x1 = extractelement <4 x i32> %x, i32 1 1638 store i32 %x1, i32* %p 1639 %x01 = add i32 %x0, %x1 1640 ret i32 %x01 1641} 1642 1643define i32 @extract_extract01_v4i32_add_i32_uses3(<4 x i32> %x, i32* %p1, i32* %p2) { 1644; SSE3-LABEL: extract_extract01_v4i32_add_i32_uses3: 1645; SSE3: # %bb.0: 1646; SSE3-NEXT: movd %xmm0, %ecx 1647; SSE3-NEXT: movd %xmm0, (%rdi) 1648; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 1649; SSE3-NEXT: movd %xmm0, %eax 1650; SSE3-NEXT: addl %ecx, %eax 1651; SSE3-NEXT: movd %xmm0, (%rsi) 1652; SSE3-NEXT: retq 1653; 1654; AVX-LABEL: extract_extract01_v4i32_add_i32_uses3: 1655; AVX: # %bb.0: 1656; AVX-NEXT: vmovd %xmm0, %ecx 1657; AVX-NEXT: vmovd %xmm0, (%rdi) 1658; AVX-NEXT: vpextrd $1, %xmm0, %eax 1659; AVX-NEXT: addl %ecx, %eax 1660; AVX-NEXT: vpextrd $1, 
%xmm0, (%rsi) 1661; AVX-NEXT: retq 1662 %x0 = extractelement <4 x i32> %x, i32 0 1663 store i32 %x0, i32* %p1 1664 %x1 = extractelement <4 x i32> %x, i32 1 1665 store i32 %x1, i32* %p2 1666 %x01 = add i32 %x0, %x1 1667 ret i32 %x01 1668} 1669 1670; PR33758: https://bugs.llvm.org/show_bug.cgi?id=33758 1671 1672define i32 @partial_reduction_add_v8i32(<8 x i32> %x) { 1673; SSE3-SLOW-LABEL: partial_reduction_add_v8i32: 1674; SSE3-SLOW: # %bb.0: 1675; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1676; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 1677; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1678; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 1679; SSE3-SLOW-NEXT: movd %xmm0, %eax 1680; SSE3-SLOW-NEXT: retq 1681; 1682; SSE3-FAST-LABEL: partial_reduction_add_v8i32: 1683; SSE3-FAST: # %bb.0: 1684; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1685; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 1686; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 1687; SSE3-FAST-NEXT: movd %xmm1, %eax 1688; SSE3-FAST-NEXT: retq 1689; 1690; AVX-SLOW-LABEL: partial_reduction_add_v8i32: 1691; AVX-SLOW: # %bb.0: 1692; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1693; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1694; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1695; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1696; AVX-SLOW-NEXT: vmovd %xmm0, %eax 1697; AVX-SLOW-NEXT: vzeroupper 1698; AVX-SLOW-NEXT: retq 1699; 1700; AVX-FAST-LABEL: partial_reduction_add_v8i32: 1701; AVX-FAST: # %bb.0: 1702; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 1703; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 1704; AVX-FAST-NEXT: vmovd %xmm0, %eax 1705; AVX-FAST-NEXT: vzeroupper 1706; AVX-FAST-NEXT: retq 1707 %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1708 %x0213 = add <8 x i32> %x, %x23 1709 %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 
undef, i32 undef> 1710 %x0123 = add <8 x i32> %x0213, %x13 1711 %r = extractelement <8 x i32> %x0123, i32 0 1712 ret i32 %r 1713} 1714 1715define i32 @partial_reduction_add_v16i32(<16 x i32> %x) { 1716; SSE3-SLOW-LABEL: partial_reduction_add_v16i32: 1717; SSE3-SLOW: # %bb.0: 1718; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1719; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 1720; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1721; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 1722; SSE3-SLOW-NEXT: movd %xmm0, %eax 1723; SSE3-SLOW-NEXT: retq 1724; 1725; SSE3-FAST-LABEL: partial_reduction_add_v16i32: 1726; SSE3-FAST: # %bb.0: 1727; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1728; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 1729; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 1730; SSE3-FAST-NEXT: movd %xmm1, %eax 1731; SSE3-FAST-NEXT: retq 1732; 1733; AVX-SLOW-LABEL: partial_reduction_add_v16i32: 1734; AVX-SLOW: # %bb.0: 1735; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1736; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1737; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1738; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1739; AVX-SLOW-NEXT: vmovd %xmm0, %eax 1740; AVX-SLOW-NEXT: vzeroupper 1741; AVX-SLOW-NEXT: retq 1742; 1743; AVX-FAST-LABEL: partial_reduction_add_v16i32: 1744; AVX-FAST: # %bb.0: 1745; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 1746; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 1747; AVX-FAST-NEXT: vmovd %xmm0, %eax 1748; AVX-FAST-NEXT: vzeroupper 1749; AVX-FAST-NEXT: retq 1750 %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1751 %x0213 = add <16 x i32> %x, %x23 1752 %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, 
i32 undef, i32 undef, i32 undef> 1753 %x0123 = add <16 x i32> %x0213, %x13 1754 %r = extractelement <16 x i32> %x0123, i32 0 1755 ret i32 %r 1756} 1757 1758define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) { 1759; SSE3-SLOW-LABEL: partial_reduction_sub_v8i32: 1760; SSE3-SLOW: # %bb.0: 1761; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1762; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 1763; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1764; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 1765; SSE3-SLOW-NEXT: movd %xmm0, %eax 1766; SSE3-SLOW-NEXT: retq 1767; 1768; SSE3-FAST-LABEL: partial_reduction_sub_v8i32: 1769; SSE3-FAST: # %bb.0: 1770; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1771; SSE3-FAST-NEXT: psubd %xmm1, %xmm0 1772; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0 1773; SSE3-FAST-NEXT: movd %xmm0, %eax 1774; SSE3-FAST-NEXT: retq 1775; 1776; AVX-SLOW-LABEL: partial_reduction_sub_v8i32: 1777; AVX-SLOW: # %bb.0: 1778; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1779; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1780; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1781; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1782; AVX-SLOW-NEXT: vmovd %xmm0, %eax 1783; AVX-SLOW-NEXT: vzeroupper 1784; AVX-SLOW-NEXT: retq 1785; 1786; AVX-FAST-LABEL: partial_reduction_sub_v8i32: 1787; AVX-FAST: # %bb.0: 1788; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1789; AVX-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1790; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 1791; AVX-FAST-NEXT: vmovd %xmm0, %eax 1792; AVX-FAST-NEXT: vzeroupper 1793; AVX-FAST-NEXT: retq 1794 %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1795 %x0213 = sub <8 x i32> %x, %x23 1796 %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1797 %x0123 = sub <8 x i32> %x0213, %x13 1798 %r = extractelement <8 x i32> 
%x0123, i32 0 1799 ret i32 %r 1800} 1801 1802define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) { 1803; SSE3-SLOW-LABEL: partial_reduction_sub_v16i32: 1804; SSE3-SLOW: # %bb.0: 1805; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1806; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 1807; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1808; SSE3-SLOW-NEXT: psubd %xmm1, %xmm0 1809; SSE3-SLOW-NEXT: movd %xmm0, %eax 1810; SSE3-SLOW-NEXT: retq 1811; 1812; SSE3-FAST-LABEL: partial_reduction_sub_v16i32: 1813; SSE3-FAST: # %bb.0: 1814; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1815; SSE3-FAST-NEXT: psubd %xmm1, %xmm0 1816; SSE3-FAST-NEXT: phsubd %xmm0, %xmm0 1817; SSE3-FAST-NEXT: movd %xmm0, %eax 1818; SSE3-FAST-NEXT: retq 1819; 1820; AVX-SLOW-LABEL: partial_reduction_sub_v16i32: 1821; AVX-SLOW: # %bb.0: 1822; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1823; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1824; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1825; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1826; AVX-SLOW-NEXT: vmovd %xmm0, %eax 1827; AVX-SLOW-NEXT: vzeroupper 1828; AVX-SLOW-NEXT: retq 1829; 1830; AVX1-FAST-LABEL: partial_reduction_sub_v16i32: 1831; AVX1-FAST: # %bb.0: 1832; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1833; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1834; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 1835; AVX1-FAST-NEXT: vmovd %xmm0, %eax 1836; AVX1-FAST-NEXT: vzeroupper 1837; AVX1-FAST-NEXT: retq 1838; 1839; AVX2-FAST-LABEL: partial_reduction_sub_v16i32: 1840; AVX2-FAST: # %bb.0: 1841; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1842; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1843; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1844; AVX2-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1845; AVX2-FAST-NEXT: vmovd %xmm0, %eax 1846; AVX2-FAST-NEXT: vzeroupper 1847; AVX2-FAST-NEXT: retq 1848; 1849; AVX512-FAST-LABEL: partial_reduction_sub_v16i32: 1850; AVX512-FAST: # %bb.0: 1851; AVX512-FAST-NEXT: 
; Completes @partial_reduction_sub_v16i32: the AVX512-FAST checks use vpshufd + vpsubd for both steps (no vphsubd), followed by the function's IR body.
; @negative_extract_v16i16_v8i16: adds the upper 128 bits of %a0 to its lower 128 bits as i16 lanes and returns the low <2 x i64>.
; Every prefix expects a plain (v)paddw (after a 128-bit extract on AVX targets), not a horizontal add.
vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1852; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1853; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1854; AVX512-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 1855; AVX512-FAST-NEXT: vmovd %xmm0, %eax 1856; AVX512-FAST-NEXT: vzeroupper 1857; AVX512-FAST-NEXT: retq 1858 %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1859 %x0213 = sub <16 x i32> %x, %x23 1860 %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1861 %x0123 = sub <16 x i32> %x0213, %x13 1862 %r = extractelement <16 x i32> %x0123, i32 0 1863 ret i32 %r 1864} 1865 1866; https://bugs.chromium.org/p/chromium/issues/detail?id=1195353 1867define <2 x i64> @negative_extract_v16i16_v8i16(<4 x i64> %a0) { 1868; SSE3-LABEL: negative_extract_v16i16_v8i16: 1869; SSE3: # %bb.0: 1870; SSE3-NEXT: paddw %xmm1, %xmm0 1871; SSE3-NEXT: retq 1872; 1873; AVX1-SLOW-LABEL: negative_extract_v16i16_v8i16: 1874; AVX1-SLOW: # %bb.0: 1875; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 1876; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1877; AVX1-SLOW-NEXT: vzeroupper 1878; AVX1-SLOW-NEXT: retq 1879; 1880; AVX1-FAST-LABEL: negative_extract_v16i16_v8i16: 1881; AVX1-FAST: # %bb.0: 1882; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 1883; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1884; AVX1-FAST-NEXT: vzeroupper 1885; AVX1-FAST-NEXT: retq 1886; 1887; AVX2-SLOW-LABEL: negative_extract_v16i16_v8i16: 1888; AVX2-SLOW: # %bb.0: 1889; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 1890; AVX2-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1891; AVX2-SLOW-NEXT: vzeroupper 1892; AVX2-SLOW-NEXT: retq 1893; 1894; AVX2-FAST-LABEL: 
; Completes @negative_extract_v16i16_v8i16 (AVX2-FAST and both AVX512 prefixes: vextracti128 + vpaddw) and its IR body, then starts the PR42023 tests.
; @hadd16_8: three-step add reduction of all eight i16 lanes into lane 0 (shuffle masks <4,5,6,7>, then <2,3>, then <1>).
; SSE3-SLOW expects pshufd, pshufd and psrld $16 each feeding a paddw; SSE3-FAST expects three phaddw.
negative_extract_v16i16_v8i16: 1895; AVX2-FAST: # %bb.0: 1896; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 1897; AVX2-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1898; AVX2-FAST-NEXT: vzeroupper 1899; AVX2-FAST-NEXT: retq 1900; 1901; AVX512-SLOW-LABEL: negative_extract_v16i16_v8i16: 1902; AVX512-SLOW: # %bb.0: 1903; AVX512-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 1904; AVX512-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1905; AVX512-SLOW-NEXT: vzeroupper 1906; AVX512-SLOW-NEXT: retq 1907; 1908; AVX512-FAST-LABEL: negative_extract_v16i16_v8i16: 1909; AVX512-FAST: # %bb.0: 1910; AVX512-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 1911; AVX512-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1912; AVX512-FAST-NEXT: vzeroupper 1913; AVX512-FAST-NEXT: retq 1914 %s = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 1915 %b = bitcast <4 x i64> %a0 to <16 x i16> 1916 %c = bitcast <4 x i64> %s to <16 x i16> 1917 %d = add <16 x i16> %b, %c 1918 %e = bitcast <16 x i16> %d to <4 x i64> 1919 %f = shufflevector <4 x i64> %e, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 1920 ret <2 x i64> %f 1921} 1922 1923; PR42023 - https://bugs.llvm.org/show_bug.cgi?id=42023 1924 1925define i16 @hadd16_8(<8 x i16> %x223) { 1926; SSE3-SLOW-LABEL: hadd16_8: 1927; SSE3-SLOW: # %bb.0: 1928; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1929; SSE3-SLOW-NEXT: paddw %xmm0, %xmm1 1930; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1931; SSE3-SLOW-NEXT: paddw %xmm1, %xmm0 1932; SSE3-SLOW-NEXT: movdqa %xmm0, %xmm1 1933; SSE3-SLOW-NEXT: psrld $16, %xmm1 1934; SSE3-SLOW-NEXT: paddw %xmm0, %xmm1 1935; SSE3-SLOW-NEXT: movd %xmm1, %eax 1936; SSE3-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1937; SSE3-SLOW-NEXT: retq 1938; 1939; SSE3-FAST-LABEL: hadd16_8: 1940; SSE3-FAST: # %bb.0: 1941; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 1942; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 1943; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0 1944; SSE3-FAST-NEXT: movd %xmm0, %eax 1945; SSE3-FAST-NEXT: # kill: 
; Completes @hadd16_8 (AVX-SLOW: shuffle/shift + vpaddw three times; AVX-FAST: three vphaddw) and its IR body.
; @hadd32_4: two-step add reduction of the four i32 lanes into lane 0; the SSE3-SLOW checks use pshufd + paddd for each step.
def $ax killed $ax killed $eax 1946; SSE3-FAST-NEXT: retq 1947; 1948; AVX-SLOW-LABEL: hadd16_8: 1949; AVX-SLOW: # %bb.0: 1950; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1951; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1952; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 1953; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1954; AVX-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 1955; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 1956; AVX-SLOW-NEXT: vmovd %xmm0, %eax 1957; AVX-SLOW-NEXT: # kill: def $ax killed $ax killed $eax 1958; AVX-SLOW-NEXT: retq 1959; 1960; AVX-FAST-LABEL: hadd16_8: 1961; AVX-FAST: # %bb.0: 1962; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1963; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1964; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 1965; AVX-FAST-NEXT: vmovd %xmm0, %eax 1966; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax 1967; AVX-FAST-NEXT: retq 1968 %x224 = shufflevector <8 x i16> %x223, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 1969 %x225 = add <8 x i16> %x223, %x224 1970 %x226 = shufflevector <8 x i16> %x225, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1971 %x227 = add <8 x i16> %x225, %x226 1972 %x228 = shufflevector <8 x i16> %x227, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 1973 %x229 = add <8 x i16> %x227, %x228 1974 %x230 = extractelement <8 x i16> %x229, i32 0 1975 ret i16 %x230 1976} 1977 1978define i32 @hadd32_4(<4 x i32> %x225) { 1979; SSE3-SLOW-LABEL: hadd32_4: 1980; SSE3-SLOW: # %bb.0: 1981; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1982; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 1983; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 1984; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 1985; SSE3-SLOW-NEXT: movd %xmm0, %eax 1986; SSE3-SLOW-NEXT: retq 1987; 1988; SSE3-FAST-LABEL: hadd32_4: 1989; SSE3-FAST: # %bb.0: 1990; SSE3-FAST-NEXT: 
; Completes @hadd32_4: the FAST prefixes expect two (v)phaddd; AVX-SLOW expects vpshufd + vpaddd for each step.
; @hadd32_8: the same add chain on an <8 x i32> argument (only lanes 0..3 contribute to the returned value).
; SSE3-FAST is expected to keep pshufd + paddd for the first step and use a single phaddd for the second.
phaddd %xmm0, %xmm0 1991; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0 1992; SSE3-FAST-NEXT: movd %xmm0, %eax 1993; SSE3-FAST-NEXT: retq 1994; 1995; AVX-SLOW-LABEL: hadd32_4: 1996; AVX-SLOW: # %bb.0: 1997; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 1998; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 1999; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2000; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2001; AVX-SLOW-NEXT: vmovd %xmm0, %eax 2002; AVX-SLOW-NEXT: retq 2003; 2004; AVX-FAST-LABEL: hadd32_4: 2005; AVX-FAST: # %bb.0: 2006; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2007; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2008; AVX-FAST-NEXT: vmovd %xmm0, %eax 2009; AVX-FAST-NEXT: retq 2010 %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 2011 %x227 = add <4 x i32> %x225, %x226 2012 %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 2013 %x229 = add <4 x i32> %x227, %x228 2014 %x230 = extractelement <4 x i32> %x229, i32 0 2015 ret i32 %x230 2016} 2017 2018define i32 @hadd32_8(<8 x i32> %x225) { 2019; SSE3-SLOW-LABEL: hadd32_8: 2020; SSE3-SLOW: # %bb.0: 2021; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2022; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 2023; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 2024; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 2025; SSE3-SLOW-NEXT: movd %xmm0, %eax 2026; SSE3-SLOW-NEXT: retq 2027; 2028; SSE3-FAST-LABEL: hadd32_8: 2029; SSE3-FAST: # %bb.0: 2030; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2031; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 2032; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 2033; SSE3-FAST-NEXT: movd %xmm1, %eax 2034; SSE3-FAST-NEXT: retq 2035; 2036; AVX-SLOW-LABEL: hadd32_8: 2037; AVX-SLOW: # %bb.0: 2038; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2039; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2040; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2041; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 
; Completes @hadd32_8 (AVX-FAST: two vphaddd, then vzeroupper) and its IR body.
; @hadd32_16: the same add chain on a <16 x i32> argument; the SSE3-SLOW, SSE3-FAST and AVX-SLOW expectations on this line mirror those of @hadd32_8.
2042; AVX-SLOW-NEXT: vmovd %xmm0, %eax 2043; AVX-SLOW-NEXT: vzeroupper 2044; AVX-SLOW-NEXT: retq 2045; 2046; AVX-FAST-LABEL: hadd32_8: 2047; AVX-FAST: # %bb.0: 2048; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2049; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2050; AVX-FAST-NEXT: vmovd %xmm0, %eax 2051; AVX-FAST-NEXT: vzeroupper 2052; AVX-FAST-NEXT: retq 2053 %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2054 %x227 = add <8 x i32> %x225, %x226 2055 %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2056 %x229 = add <8 x i32> %x227, %x228 2057 %x230 = extractelement <8 x i32> %x229, i32 0 2058 ret i32 %x230 2059} 2060 2061define i32 @hadd32_16(<16 x i32> %x225) { 2062; SSE3-SLOW-LABEL: hadd32_16: 2063; SSE3-SLOW: # %bb.0: 2064; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2065; SSE3-SLOW-NEXT: paddd %xmm0, %xmm1 2066; SSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] 2067; SSE3-SLOW-NEXT: paddd %xmm1, %xmm0 2068; SSE3-SLOW-NEXT: movd %xmm0, %eax 2069; SSE3-SLOW-NEXT: retq 2070; 2071; SSE3-FAST-LABEL: hadd32_16: 2072; SSE3-FAST: # %bb.0: 2073; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2074; SSE3-FAST-NEXT: paddd %xmm0, %xmm1 2075; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1 2076; SSE3-FAST-NEXT: movd %xmm1, %eax 2077; SSE3-FAST-NEXT: retq 2078; 2079; AVX-SLOW-LABEL: hadd32_16: 2080; AVX-SLOW: # %bb.0: 2081; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2082; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2083; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 2084; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2085; AVX-SLOW-NEXT: vmovd %xmm0, %eax 2086; AVX-SLOW-NEXT: vzeroupper 2087; AVX-SLOW-NEXT: retq 2088; 2089; AVX-FAST-LABEL: hadd32_16: 2090; AVX-FAST: # %bb.0: 2091; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2092; AVX-FAST-NEXT: vphaddd %xmm0, 
; Completes @hadd32_16 (AVX-FAST: two vphaddd, then vzeroupper) and its IR body.
; @hadd16_8_optsize: the @hadd16_8 reduction with the optsize attribute.
; It is checked under the shared SSE3 / AVX prefixes, so three (v)phaddw are expected both with and without fast-hops.
%xmm0, %xmm0 2093; AVX-FAST-NEXT: vmovd %xmm0, %eax 2094; AVX-FAST-NEXT: vzeroupper 2095; AVX-FAST-NEXT: retq 2096 %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2097 %x227 = add <16 x i32> %x225, %x226 2098 %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2099 %x229 = add <16 x i32> %x227, %x228 2100 %x230 = extractelement <16 x i32> %x229, i32 0 2101 ret i32 %x230 2102} 2103 2104define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize { 2105; SSE3-LABEL: hadd16_8_optsize: 2106; SSE3: # %bb.0: 2107; SSE3-NEXT: phaddw %xmm0, %xmm0 2108; SSE3-NEXT: phaddw %xmm0, %xmm0 2109; SSE3-NEXT: phaddw %xmm0, %xmm0 2110; SSE3-NEXT: movd %xmm0, %eax 2111; SSE3-NEXT: # kill: def $ax killed $ax killed $eax 2112; SSE3-NEXT: retq 2113; 2114; AVX-LABEL: hadd16_8_optsize: 2115; AVX: # %bb.0: 2116; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 2117; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 2118; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 2119; AVX-NEXT: vmovd %xmm0, %eax 2120; AVX-NEXT: # kill: def $ax killed $ax killed $eax 2121; AVX-NEXT: retq 2122 %x224 = shufflevector <8 x i16> %x223, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 2123 %x225 = add <8 x i16> %x223, %x224 2124 %x226 = shufflevector <8 x i16> %x225, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2125 %x227 = add <8 x i16> %x225, %x226 2126 %x228 = shufflevector <8 x i16> %x227, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2127 %x229 = add <8 x i16> %x227, %x228 2128 %x230 = 
; Completes @hadd16_8_optsize.
; @hadd32_4_optsize / @hadd32_4_pgso: the @hadd32_4 reduction marked optsize and !prof !14 respectively; both expect two (v)phaddd under the shared SSE3 / AVX prefixes.
; @hadd32_8_optsize starts here: its SSE3 checks keep pshufd + paddd for the first step and use one phaddd for the second.
extractelement <8 x i16> %x229, i32 0 2129 ret i16 %x230 2130} 2131 2132define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize { 2133; SSE3-LABEL: hadd32_4_optsize: 2134; SSE3: # %bb.0: 2135; SSE3-NEXT: phaddd %xmm0, %xmm0 2136; SSE3-NEXT: phaddd %xmm0, %xmm0 2137; SSE3-NEXT: movd %xmm0, %eax 2138; SSE3-NEXT: retq 2139; 2140; AVX-LABEL: hadd32_4_optsize: 2141; AVX: # %bb.0: 2142; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2143; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2144; AVX-NEXT: vmovd %xmm0, %eax 2145; AVX-NEXT: retq 2146 %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 2147 %x227 = add <4 x i32> %x225, %x226 2148 %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 2149 %x229 = add <4 x i32> %x227, %x228 2150 %x230 = extractelement <4 x i32> %x229, i32 0 2151 ret i32 %x230 2152} 2153 2154define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 { 2155; SSE3-LABEL: hadd32_4_pgso: 2156; SSE3: # %bb.0: 2157; SSE3-NEXT: phaddd %xmm0, %xmm0 2158; SSE3-NEXT: phaddd %xmm0, %xmm0 2159; SSE3-NEXT: movd %xmm0, %eax 2160; SSE3-NEXT: retq 2161; 2162; AVX-LABEL: hadd32_4_pgso: 2163; AVX: # %bb.0: 2164; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2165; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2166; AVX-NEXT: vmovd %xmm0, %eax 2167; AVX-NEXT: retq 2168 %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> 2169 %x227 = add <4 x i32> %x225, %x226 2170 %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 2171 %x229 = add <4 x i32> %x227, %x228 2172 %x230 = extractelement <4 x i32> %x229, i32 0 2173 ret i32 %x230 2174} 2175 2176define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize { 2177; SSE3-LABEL: hadd32_8_optsize: 2178; SSE3: # %bb.0: 2179; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2180; SSE3-NEXT: paddd %xmm0, %xmm1 2181; SSE3-NEXT: phaddd %xmm1, %xmm1 2182; SSE3-NEXT: movd %xmm1, 
; Completes @hadd32_8_optsize (AVX: two vphaddd, then vzeroupper) and its IR body.
; @hadd32_16_optsize: the same reduction on a <16 x i32> argument with optsize; SSE3 expects pshufd + paddd + phaddd, AVX expects two vphaddd.
; The module flags that follow attach a ProfileSummary (!0) to the module.
%eax 2183; SSE3-NEXT: retq 2184; 2185; AVX-LABEL: hadd32_8_optsize: 2186; AVX: # %bb.0: 2187; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2188; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2189; AVX-NEXT: vmovd %xmm0, %eax 2190; AVX-NEXT: vzeroupper 2191; AVX-NEXT: retq 2192 %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2193 %x227 = add <8 x i32> %x225, %x226 2194 %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2195 %x229 = add <8 x i32> %x227, %x228 2196 %x230 = extractelement <8 x i32> %x229, i32 0 2197 ret i32 %x230 2198} 2199 2200define i32 @hadd32_16_optsize(<16 x i32> %x225) optsize { 2201; SSE3-LABEL: hadd32_16_optsize: 2202; SSE3: # %bb.0: 2203; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] 2204; SSE3-NEXT: paddd %xmm0, %xmm1 2205; SSE3-NEXT: phaddd %xmm1, %xmm1 2206; SSE3-NEXT: movd %xmm1, %eax 2207; SSE3-NEXT: retq 2208; 2209; AVX-LABEL: hadd32_16_optsize: 2210; AVX: # %bb.0: 2211; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2212; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 2213; AVX-NEXT: vmovd %xmm0, %eax 2214; AVX-NEXT: vzeroupper 2215; AVX-NEXT: retq 2216 %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2217 %x227 = add <16 x i32> %x225, %x226 2218 %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2219 %x229 = add <16 x i32> %x227, %x228 2220 %x230 = extractelement <16 x i32> %x229, i32 0 2221 ret i32 %x230 2222} 2223 2224!llvm.module.flags = !{!0} 2225!0 = !{i32 1, !"ProfileSummary", !1} 2226!1 = 
; Operands of the ProfileSummary tuple !1: profile format, count totals/maxima, and the DetailedSummary list (!10 -> !11..!13).
; !14 is the function_entry_count of 0 that @hadd32_4_pgso references through !prof.
!{!2, !3, !4, !5, !6, !7, !8, !9} 2227!2 = !{!"ProfileFormat", !"InstrProf"} 2228!3 = !{!"TotalCount", i64 10000} 2229!4 = !{!"MaxCount", i64 10} 2230!5 = !{!"MaxInternalCount", i64 1} 2231!6 = !{!"MaxFunctionCount", i64 1000} 2232!7 = !{!"NumCounts", i64 3} 2233!8 = !{!"NumFunctions", i64 3} 2234!9 = !{!"DetailedSummary", !10} 2235!10 = !{!11, !12, !13} 2236!11 = !{i32 10000, i64 100, i32 1} 2237!12 = !{i32 999000, i64 100, i32 1} 2238!13 = !{i32 999999, i64 1, i32 2} 2239!14 = !{!"function_entry_count", i64 0} 2240