1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3              | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops    | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx                | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
5; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops      | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
6; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2               | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
7; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops     | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
8; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl           | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512-SLOW
9; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512-FAST
10
11; 128-bit vectors, 16/32-bit, add/sub
12
; Adds elements 0 and 1 of a <4 x i32> and returns the scalar sum.
; Expected codegen per the assertions below: with fast-hops, one horizontal
; add (phaddd / vphaddd) followed by a move of result lane 0 into eax;
; without fast-hops, two scalar extracts followed by addl.
13define i32 @extract_extract01_v4i32_add_i32(<4 x i32> %x) {
14; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32:
15; SSE3-SLOW:       # %bb.0:
16; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
17; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
18; SSE3-SLOW-NEXT:    movd %xmm0, %eax
19; SSE3-SLOW-NEXT:    addl %ecx, %eax
20; SSE3-SLOW-NEXT:    retq
21;
22; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32:
23; SSE3-FAST:       # %bb.0:
24; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
25; SSE3-FAST-NEXT:    movd %xmm0, %eax
26; SSE3-FAST-NEXT:    retq
27;
28; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32:
29; AVX-SLOW:       # %bb.0:
30; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
31; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax
32; AVX-SLOW-NEXT:    addl %ecx, %eax
33; AVX-SLOW-NEXT:    retq
34;
35; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32:
36; AVX-FAST:       # %bb.0:
37; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
38; AVX-FAST-NEXT:    vmovd %xmm0, %eax
39; AVX-FAST-NEXT:    retq
40  %x0 = extractelement <4 x i32> %x, i32 0
41  %x1 = extractelement <4 x i32> %x, i32 1
42  %x01 = add i32 %x0, %x1
43  ret i32 %x01
44}
45
; Adds elements 2 and 3 of a <4 x i32> and returns the scalar sum.
; Expected codegen per the assertions below: with fast-hops, a horizontal add
; whose result is then read from lane 1 (pshufd+movd, or vpextrd $1);
; without fast-hops, two scalar extracts followed by addl.
46define i32 @extract_extract23_v4i32_add_i32(<4 x i32> %x) {
47; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32:
48; SSE3-SLOW:       # %bb.0:
49; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
50; SSE3-SLOW-NEXT:    movd %xmm1, %ecx
51; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
52; SSE3-SLOW-NEXT:    movd %xmm0, %eax
53; SSE3-SLOW-NEXT:    addl %ecx, %eax
54; SSE3-SLOW-NEXT:    retq
55;
56; SSE3-FAST-LABEL: extract_extract23_v4i32_add_i32:
57; SSE3-FAST:       # %bb.0:
58; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
59; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
60; SSE3-FAST-NEXT:    movd %xmm0, %eax
61; SSE3-FAST-NEXT:    retq
62;
63; AVX-SLOW-LABEL: extract_extract23_v4i32_add_i32:
64; AVX-SLOW:       # %bb.0:
65; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx
66; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax
67; AVX-SLOW-NEXT:    addl %ecx, %eax
68; AVX-SLOW-NEXT:    retq
69;
70; AVX-FAST-LABEL: extract_extract23_v4i32_add_i32:
71; AVX-FAST:       # %bb.0:
72; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
73; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax
74; AVX-FAST-NEXT:    retq
75  %x0 = extractelement <4 x i32> %x, i32 2
76  %x1 = extractelement <4 x i32> %x, i32 3
77  %x01 = add i32 %x0, %x1
78  ret i32 %x01
79}
80
; Commuted form of the element-0/element-1 add on <4 x i32>: the add takes
; %x1 as its first operand and %x0 as its second.
; The assertions below expect the same instruction sequences as the
; non-commuted test: a horizontal add with fast-hops, scalar extracts plus
; addl without it.
81define i32 @extract_extract01_v4i32_add_i32_commute(<4 x i32> %x) {
82; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32_commute:
83; SSE3-SLOW:       # %bb.0:
84; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
85; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
86; SSE3-SLOW-NEXT:    movd %xmm0, %eax
87; SSE3-SLOW-NEXT:    addl %ecx, %eax
88; SSE3-SLOW-NEXT:    retq
89;
90; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_commute:
91; SSE3-FAST:       # %bb.0:
92; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
93; SSE3-FAST-NEXT:    movd %xmm0, %eax
94; SSE3-FAST-NEXT:    retq
95;
96; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_commute:
97; AVX-SLOW:       # %bb.0:
98; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
99; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax
100; AVX-SLOW-NEXT:    addl %ecx, %eax
101; AVX-SLOW-NEXT:    retq
102;
103; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_commute:
104; AVX-FAST:       # %bb.0:
105; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
106; AVX-FAST-NEXT:    vmovd %xmm0, %eax
107; AVX-FAST-NEXT:    retq
108  %x0 = extractelement <4 x i32> %x, i32 0
109  %x1 = extractelement <4 x i32> %x, i32 1
110  %x01 = add i32 %x1, %x0
111  ret i32 %x01
112}
113
; Commuted form of the element-2/element-3 add on <4 x i32>: the add takes
; %x1 (element 3) first and %x0 (element 2) second.
; The assertions below expect a horizontal add read back from lane 1 with
; fast-hops, and two scalar extracts plus addl without it.
114define i32 @extract_extract23_v4i32_add_i32_commute(<4 x i32> %x) {
115; SSE3-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute:
116; SSE3-SLOW:       # %bb.0:
117; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
118; SSE3-SLOW-NEXT:    movd %xmm1, %ecx
119; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
120; SSE3-SLOW-NEXT:    movd %xmm0, %eax
121; SSE3-SLOW-NEXT:    addl %ecx, %eax
122; SSE3-SLOW-NEXT:    retq
123;
124; SSE3-FAST-LABEL: extract_extract23_v4i32_add_i32_commute:
125; SSE3-FAST:       # %bb.0:
126; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
127; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
128; SSE3-FAST-NEXT:    movd %xmm0, %eax
129; SSE3-FAST-NEXT:    retq
130;
131; AVX-SLOW-LABEL: extract_extract23_v4i32_add_i32_commute:
132; AVX-SLOW:       # %bb.0:
133; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx
134; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax
135; AVX-SLOW-NEXT:    addl %ecx, %eax
136; AVX-SLOW-NEXT:    retq
137;
138; AVX-FAST-LABEL: extract_extract23_v4i32_add_i32_commute:
139; AVX-FAST:       # %bb.0:
140; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
141; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax
142; AVX-FAST-NEXT:    retq
143  %x0 = extractelement <4 x i32> %x, i32 2
144  %x1 = extractelement <4 x i32> %x, i32 3
145  %x01 = add i32 %x1, %x0
146  ret i32 %x01
147}
148
; Adds elements 0 and 1 of an <8 x i16> and returns the i16 sum.
; Expected codegen per the assertions below: with fast-hops, a 16-bit
; horizontal add (phaddw / vphaddw) followed by a move of the low lane into
; eax; without fast-hops, movd + pextrw $1 followed by a 32-bit addl.
149define i16 @extract_extract01_v8i16_add_i16(<8 x i16> %x) {
150; SSE3-SLOW-LABEL: extract_extract01_v8i16_add_i16:
151; SSE3-SLOW:       # %bb.0:
152; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
153; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax
154; SSE3-SLOW-NEXT:    addl %ecx, %eax
155; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
156; SSE3-SLOW-NEXT:    retq
157;
158; SSE3-FAST-LABEL: extract_extract01_v8i16_add_i16:
159; SSE3-FAST:       # %bb.0:
160; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
161; SSE3-FAST-NEXT:    movd %xmm0, %eax
162; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
163; SSE3-FAST-NEXT:    retq
164;
165; AVX-SLOW-LABEL: extract_extract01_v8i16_add_i16:
166; AVX-SLOW:       # %bb.0:
167; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
168; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
169; AVX-SLOW-NEXT:    addl %ecx, %eax
170; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
171; AVX-SLOW-NEXT:    retq
172;
173; AVX-FAST-LABEL: extract_extract01_v8i16_add_i16:
174; AVX-FAST:       # %bb.0:
175; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
176; AVX-FAST-NEXT:    vmovd %xmm0, %eax
177; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
178; AVX-FAST-NEXT:    retq
179  %x0 = extractelement <8 x i16> %x, i32 0
180  %x1 = extractelement <8 x i16> %x, i32 1
181  %x01 = add i16 %x0, %x1
182  ret i16 %x01
183}
184
; Adds elements 4 and 5 of an <8 x i16> and returns the i16 sum.
; Expected codegen per the assertions below: with fast-hops, a 16-bit
; horizontal add whose result is read from lane 2 (pextrw $2 / vpextrw $2);
; without fast-hops, pextrw $4 and pextrw $5 followed by addl.
185define i16 @extract_extract45_v8i16_add_i16(<8 x i16> %x) {
186; SSE3-SLOW-LABEL: extract_extract45_v8i16_add_i16:
187; SSE3-SLOW:       # %bb.0:
188; SSE3-SLOW-NEXT:    pextrw $4, %xmm0, %ecx
189; SSE3-SLOW-NEXT:    pextrw $5, %xmm0, %eax
190; SSE3-SLOW-NEXT:    addl %ecx, %eax
191; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
192; SSE3-SLOW-NEXT:    retq
193;
194; SSE3-FAST-LABEL: extract_extract45_v8i16_add_i16:
195; SSE3-FAST:       # %bb.0:
196; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
197; SSE3-FAST-NEXT:    pextrw $2, %xmm0, %eax
198; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
199; SSE3-FAST-NEXT:    retq
200;
201; AVX-SLOW-LABEL: extract_extract45_v8i16_add_i16:
202; AVX-SLOW:       # %bb.0:
203; AVX-SLOW-NEXT:    vpextrw $4, %xmm0, %ecx
204; AVX-SLOW-NEXT:    vpextrw $5, %xmm0, %eax
205; AVX-SLOW-NEXT:    addl %ecx, %eax
206; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
207; AVX-SLOW-NEXT:    retq
208;
209; AVX-FAST-LABEL: extract_extract45_v8i16_add_i16:
210; AVX-FAST:       # %bb.0:
211; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
212; AVX-FAST-NEXT:    vpextrw $2, %xmm0, %eax
213; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
214; AVX-FAST-NEXT:    retq
215  %x0 = extractelement <8 x i16> %x, i32 4
216  %x1 = extractelement <8 x i16> %x, i32 5
217  %x01 = add i16 %x0, %x1
218  ret i16 %x01
219}
220
; Commuted form of the element-0/element-1 add on <8 x i16>: the add takes
; %x1 first and %x0 second.
; The assertions below expect phaddw / vphaddw plus a low-lane move with
; fast-hops, and movd + pextrw $1 + addl without it.
221define i16 @extract_extract01_v8i16_add_i16_commute(<8 x i16> %x) {
222; SSE3-SLOW-LABEL: extract_extract01_v8i16_add_i16_commute:
223; SSE3-SLOW:       # %bb.0:
224; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
225; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax
226; SSE3-SLOW-NEXT:    addl %ecx, %eax
227; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
228; SSE3-SLOW-NEXT:    retq
229;
230; SSE3-FAST-LABEL: extract_extract01_v8i16_add_i16_commute:
231; SSE3-FAST:       # %bb.0:
232; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
233; SSE3-FAST-NEXT:    movd %xmm0, %eax
234; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
235; SSE3-FAST-NEXT:    retq
236;
237; AVX-SLOW-LABEL: extract_extract01_v8i16_add_i16_commute:
238; AVX-SLOW:       # %bb.0:
239; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
240; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
241; AVX-SLOW-NEXT:    addl %ecx, %eax
242; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
243; AVX-SLOW-NEXT:    retq
244;
245; AVX-FAST-LABEL: extract_extract01_v8i16_add_i16_commute:
246; AVX-FAST:       # %bb.0:
247; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
248; AVX-FAST-NEXT:    vmovd %xmm0, %eax
249; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
250; AVX-FAST-NEXT:    retq
251  %x0 = extractelement <8 x i16> %x, i32 0
252  %x1 = extractelement <8 x i16> %x, i32 1
253  %x01 = add i16 %x1, %x0
254  ret i16 %x01
255}
256
; Commuted form of the element-4/element-5 add on <8 x i16>: the add takes
; %x1 (element 5) first and %x0 (element 4) second.
; The assertions below expect a 16-bit horizontal add read back from lane 2
; with fast-hops, and pextrw $4 / pextrw $5 plus addl without it.
257define i16 @extract_extract45_v8i16_add_i16_commute(<8 x i16> %x) {
258; SSE3-SLOW-LABEL: extract_extract45_v8i16_add_i16_commute:
259; SSE3-SLOW:       # %bb.0:
260; SSE3-SLOW-NEXT:    pextrw $4, %xmm0, %ecx
261; SSE3-SLOW-NEXT:    pextrw $5, %xmm0, %eax
262; SSE3-SLOW-NEXT:    addl %ecx, %eax
263; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
264; SSE3-SLOW-NEXT:    retq
265;
266; SSE3-FAST-LABEL: extract_extract45_v8i16_add_i16_commute:
267; SSE3-FAST:       # %bb.0:
268; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
269; SSE3-FAST-NEXT:    pextrw $2, %xmm0, %eax
270; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
271; SSE3-FAST-NEXT:    retq
272;
273; AVX-SLOW-LABEL: extract_extract45_v8i16_add_i16_commute:
274; AVX-SLOW:       # %bb.0:
275; AVX-SLOW-NEXT:    vpextrw $4, %xmm0, %ecx
276; AVX-SLOW-NEXT:    vpextrw $5, %xmm0, %eax
277; AVX-SLOW-NEXT:    addl %ecx, %eax
278; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
279; AVX-SLOW-NEXT:    retq
280;
281; AVX-FAST-LABEL: extract_extract45_v8i16_add_i16_commute:
282; AVX-FAST:       # %bb.0:
283; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
284; AVX-FAST-NEXT:    vpextrw $2, %xmm0, %eax
285; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
286; AVX-FAST-NEXT:    retq
287  %x0 = extractelement <8 x i16> %x, i32 4
288  %x1 = extractelement <8 x i16> %x, i32 5
289  %x01 = add i16 %x1, %x0
290  ret i16 %x01
291}
292
; Computes element 0 minus element 1 of a <4 x i32>.
; Expected codegen per the assertions below: with fast-hops, a horizontal
; subtract (phsubd / vphsubd) followed by a move of result lane 0 into eax;
; without fast-hops, two scalar extracts followed by subl.
293define i32 @extract_extract01_v4i32_sub_i32(<4 x i32> %x) {
294; SSE3-SLOW-LABEL: extract_extract01_v4i32_sub_i32:
295; SSE3-SLOW:       # %bb.0:
296; SSE3-SLOW-NEXT:    movd %xmm0, %eax
297; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
298; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
299; SSE3-SLOW-NEXT:    subl %ecx, %eax
300; SSE3-SLOW-NEXT:    retq
301;
302; SSE3-FAST-LABEL: extract_extract01_v4i32_sub_i32:
303; SSE3-FAST:       # %bb.0:
304; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
305; SSE3-FAST-NEXT:    movd %xmm0, %eax
306; SSE3-FAST-NEXT:    retq
307;
308; AVX-SLOW-LABEL: extract_extract01_v4i32_sub_i32:
309; AVX-SLOW:       # %bb.0:
310; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
311; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx
312; AVX-SLOW-NEXT:    subl %ecx, %eax
313; AVX-SLOW-NEXT:    retq
314;
315; AVX-FAST-LABEL: extract_extract01_v4i32_sub_i32:
316; AVX-FAST:       # %bb.0:
317; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
318; AVX-FAST-NEXT:    vmovd %xmm0, %eax
319; AVX-FAST-NEXT:    retq
320  %x0 = extractelement <4 x i32> %x, i32 0
321  %x1 = extractelement <4 x i32> %x, i32 1
322  %x01 = sub i32 %x0, %x1
323  ret i32 %x01
324}
325
; Computes element 2 minus element 3 of a <4 x i32>.
; Expected codegen per the assertions below: with fast-hops, a horizontal
; subtract whose result is read from lane 1 (pshufd+movd, or vpextrd $1);
; without fast-hops, two scalar extracts followed by subl.
326define i32 @extract_extract23_v4i32_sub_i32(<4 x i32> %x) {
327; SSE3-SLOW-LABEL: extract_extract23_v4i32_sub_i32:
328; SSE3-SLOW:       # %bb.0:
329; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
330; SSE3-SLOW-NEXT:    movd %xmm1, %eax
331; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
332; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
333; SSE3-SLOW-NEXT:    subl %ecx, %eax
334; SSE3-SLOW-NEXT:    retq
335;
336; SSE3-FAST-LABEL: extract_extract23_v4i32_sub_i32:
337; SSE3-FAST:       # %bb.0:
338; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
339; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
340; SSE3-FAST-NEXT:    movd %xmm0, %eax
341; SSE3-FAST-NEXT:    retq
342;
343; AVX-SLOW-LABEL: extract_extract23_v4i32_sub_i32:
344; AVX-SLOW:       # %bb.0:
345; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %eax
346; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %ecx
347; AVX-SLOW-NEXT:    subl %ecx, %eax
348; AVX-SLOW-NEXT:    retq
349;
350; AVX-FAST-LABEL: extract_extract23_v4i32_sub_i32:
351; AVX-FAST:       # %bb.0:
352; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
353; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax
354; AVX-FAST-NEXT:    retq
355  %x0 = extractelement <4 x i32> %x, i32 2
356  %x1 = extractelement <4 x i32> %x, i32 3
357  %x01 = sub i32 %x0, %x1
358  ret i32 %x01
359}
360
; Computes element 1 minus element 0 of a <4 x i32> (operands reversed
; relative to the element-0-minus-element-1 test).
; The assertions below are shared by the slow and fast-hops configurations
; and expect scalar extracts plus subl only; no horizontal subtract appears.
; NOTE(review): presumably because subtraction is not commutative and the
; horizontal subtract only yields the lower-index-minus-higher-index
; difference - confirm against the X86 lowering code.
361define i32 @extract_extract01_v4i32_sub_i32_commute(<4 x i32> %x) {
362; SSE3-LABEL: extract_extract01_v4i32_sub_i32_commute:
363; SSE3:       # %bb.0:
364; SSE3-NEXT:    movd %xmm0, %ecx
365; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
366; SSE3-NEXT:    movd %xmm0, %eax
367; SSE3-NEXT:    subl %ecx, %eax
368; SSE3-NEXT:    retq
369;
370; AVX-LABEL: extract_extract01_v4i32_sub_i32_commute:
371; AVX:       # %bb.0:
372; AVX-NEXT:    vmovd %xmm0, %ecx
373; AVX-NEXT:    vpextrd $1, %xmm0, %eax
374; AVX-NEXT:    subl %ecx, %eax
375; AVX-NEXT:    retq
376  %x0 = extractelement <4 x i32> %x, i32 0
377  %x1 = extractelement <4 x i32> %x, i32 1
378  %x01 = sub i32 %x1, %x0
379  ret i32 %x01
380}
381
; Computes element 3 minus element 2 of a <4 x i32> (operands reversed
; relative to the element-2-minus-element-3 test).
; The assertions below are shared by the slow and fast-hops configurations
; and expect scalar extracts plus subl only; no horizontal subtract appears.
; NOTE(review): presumably because the horizontal subtract cannot produce
; the higher-index-minus-lower-index difference - confirm.
382define i32 @extract_extract23_v4i32_sub_i32_commute(<4 x i32> %x) {
383; SSE3-LABEL: extract_extract23_v4i32_sub_i32_commute:
384; SSE3:       # %bb.0:
385; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
386; SSE3-NEXT:    movd %xmm1, %ecx
387; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
388; SSE3-NEXT:    movd %xmm0, %eax
389; SSE3-NEXT:    subl %ecx, %eax
390; SSE3-NEXT:    retq
391;
392; AVX-LABEL: extract_extract23_v4i32_sub_i32_commute:
393; AVX:       # %bb.0:
394; AVX-NEXT:    vextractps $2, %xmm0, %ecx
395; AVX-NEXT:    vextractps $3, %xmm0, %eax
396; AVX-NEXT:    subl %ecx, %eax
397; AVX-NEXT:    retq
398  %x0 = extractelement <4 x i32> %x, i32 2
399  %x1 = extractelement <4 x i32> %x, i32 3
400  %x01 = sub i32 %x1, %x0
401  ret i32 %x01
402}
403
; Computes element 0 minus element 1 of an <8 x i16>.
; Expected codegen per the assertions below: with fast-hops, a 16-bit
; horizontal subtract (phsubw / vphsubw) followed by a move of the low lane
; into eax; without fast-hops, movd + pextrw $1 followed by subl.
404define i16 @extract_extract01_v8i16_sub_i16(<8 x i16> %x) {
405; SSE3-SLOW-LABEL: extract_extract01_v8i16_sub_i16:
406; SSE3-SLOW:       # %bb.0:
407; SSE3-SLOW-NEXT:    movd %xmm0, %eax
408; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %ecx
409; SSE3-SLOW-NEXT:    subl %ecx, %eax
410; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
411; SSE3-SLOW-NEXT:    retq
412;
413; SSE3-FAST-LABEL: extract_extract01_v8i16_sub_i16:
414; SSE3-FAST:       # %bb.0:
415; SSE3-FAST-NEXT:    phsubw %xmm0, %xmm0
416; SSE3-FAST-NEXT:    movd %xmm0, %eax
417; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
418; SSE3-FAST-NEXT:    retq
419;
420; AVX-SLOW-LABEL: extract_extract01_v8i16_sub_i16:
421; AVX-SLOW:       # %bb.0:
422; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
423; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %ecx
424; AVX-SLOW-NEXT:    subl %ecx, %eax
425; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
426; AVX-SLOW-NEXT:    retq
427;
428; AVX-FAST-LABEL: extract_extract01_v8i16_sub_i16:
429; AVX-FAST:       # %bb.0:
430; AVX-FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
431; AVX-FAST-NEXT:    vmovd %xmm0, %eax
432; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
433; AVX-FAST-NEXT:    retq
434  %x0 = extractelement <8 x i16> %x, i32 0
435  %x1 = extractelement <8 x i16> %x, i32 1
436  %x01 = sub i16 %x0, %x1
437  ret i16 %x01
438}
439
; Computes element 2 minus element 3 of an <8 x i16>.
; Expected codegen per the assertions below: with fast-hops, a 16-bit
; horizontal subtract whose result is read from lane 1 (pextrw $1 /
; vpextrw $1); without fast-hops, pextrw $2 and pextrw $3 followed by subl.
440define i16 @extract_extract23_v8i16_sub_i16(<8 x i16> %x) {
441; SSE3-SLOW-LABEL: extract_extract23_v8i16_sub_i16:
442; SSE3-SLOW:       # %bb.0:
443; SSE3-SLOW-NEXT:    pextrw $2, %xmm0, %eax
444; SSE3-SLOW-NEXT:    pextrw $3, %xmm0, %ecx
445; SSE3-SLOW-NEXT:    subl %ecx, %eax
446; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
447; SSE3-SLOW-NEXT:    retq
448;
449; SSE3-FAST-LABEL: extract_extract23_v8i16_sub_i16:
450; SSE3-FAST:       # %bb.0:
451; SSE3-FAST-NEXT:    phsubw %xmm0, %xmm0
452; SSE3-FAST-NEXT:    pextrw $1, %xmm0, %eax
453; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
454; SSE3-FAST-NEXT:    retq
455;
456; AVX-SLOW-LABEL: extract_extract23_v8i16_sub_i16:
457; AVX-SLOW:       # %bb.0:
458; AVX-SLOW-NEXT:    vpextrw $2, %xmm0, %eax
459; AVX-SLOW-NEXT:    vpextrw $3, %xmm0, %ecx
460; AVX-SLOW-NEXT:    subl %ecx, %eax
461; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
462; AVX-SLOW-NEXT:    retq
463;
464; AVX-FAST-LABEL: extract_extract23_v8i16_sub_i16:
465; AVX-FAST:       # %bb.0:
466; AVX-FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
467; AVX-FAST-NEXT:    vpextrw $1, %xmm0, %eax
468; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
469; AVX-FAST-NEXT:    retq
470  %x0 = extractelement <8 x i16> %x, i32 2
471  %x1 = extractelement <8 x i16> %x, i32 3
472  %x01 = sub i16 %x0, %x1
473  ret i16 %x01
474}
475
; Computes element 1 minus element 0 of an <8 x i16> (operands reversed
; relative to the element-0-minus-element-1 test).
; The assertions below are shared by the slow and fast-hops configurations
; and expect scalar extracts plus subl only; no horizontal subtract appears.
; NOTE(review): presumably because the horizontal subtract cannot produce
; the higher-index-minus-lower-index difference - confirm.
476define i16 @extract_extract01_v8i16_sub_i16_commute(<8 x i16> %x) {
477; SSE3-LABEL: extract_extract01_v8i16_sub_i16_commute:
478; SSE3:       # %bb.0:
479; SSE3-NEXT:    movd %xmm0, %ecx
480; SSE3-NEXT:    pextrw $1, %xmm0, %eax
481; SSE3-NEXT:    subl %ecx, %eax
482; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax
483; SSE3-NEXT:    retq
484;
485; AVX-LABEL: extract_extract01_v8i16_sub_i16_commute:
486; AVX:       # %bb.0:
487; AVX-NEXT:    vmovd %xmm0, %ecx
488; AVX-NEXT:    vpextrw $1, %xmm0, %eax
489; AVX-NEXT:    subl %ecx, %eax
490; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
491; AVX-NEXT:    retq
492  %x0 = extractelement <8 x i16> %x, i32 0
493  %x1 = extractelement <8 x i16> %x, i32 1
494  %x01 = sub i16 %x1, %x0
495  ret i16 %x01
496}
497
; Computes element 3 minus element 2 of an <8 x i16> (operands reversed
; relative to the element-2-minus-element-3 test).
; The assertions below are shared by the slow and fast-hops configurations
; and expect scalar extracts plus subl only; no horizontal subtract appears.
; NOTE(review): presumably because the horizontal subtract cannot produce
; the higher-index-minus-lower-index difference - confirm.
498define i16 @extract_extract23_v8i16_sub_i16_commute(<8 x i16> %x) {
499; SSE3-LABEL: extract_extract23_v8i16_sub_i16_commute:
500; SSE3:       # %bb.0:
501; SSE3-NEXT:    pextrw $2, %xmm0, %ecx
502; SSE3-NEXT:    pextrw $3, %xmm0, %eax
503; SSE3-NEXT:    subl %ecx, %eax
504; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax
505; SSE3-NEXT:    retq
506;
507; AVX-LABEL: extract_extract23_v8i16_sub_i16_commute:
508; AVX:       # %bb.0:
509; AVX-NEXT:    vpextrw $2, %xmm0, %ecx
510; AVX-NEXT:    vpextrw $3, %xmm0, %eax
511; AVX-NEXT:    subl %ecx, %eax
512; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
513; AVX-NEXT:    retq
514  %x0 = extractelement <8 x i16> %x, i32 2
515  %x1 = extractelement <8 x i16> %x, i32 3
516  %x01 = sub i16 %x1, %x0
517  ret i16 %x01
518}
519
520; 256-bit vectors, i32/i16, add/sub
521
; Adds elements 0 and 1 of a 256-bit <8 x i32>.
; Both elements are read from xmm0 in every expected sequence below, so the
; instruction patterns are the 128-bit ones (horizontal add with fast-hops,
; scalar extracts plus addl without); the AVX sequences additionally end
; with vzeroupper before retq.
522define i32 @extract_extract01_v8i32_add_i32(<8 x i32> %x) {
523; SSE3-SLOW-LABEL: extract_extract01_v8i32_add_i32:
524; SSE3-SLOW:       # %bb.0:
525; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
526; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
527; SSE3-SLOW-NEXT:    movd %xmm0, %eax
528; SSE3-SLOW-NEXT:    addl %ecx, %eax
529; SSE3-SLOW-NEXT:    retq
530;
531; SSE3-FAST-LABEL: extract_extract01_v8i32_add_i32:
532; SSE3-FAST:       # %bb.0:
533; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
534; SSE3-FAST-NEXT:    movd %xmm0, %eax
535; SSE3-FAST-NEXT:    retq
536;
537; AVX-SLOW-LABEL: extract_extract01_v8i32_add_i32:
538; AVX-SLOW:       # %bb.0:
539; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
540; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax
541; AVX-SLOW-NEXT:    addl %ecx, %eax
542; AVX-SLOW-NEXT:    vzeroupper
543; AVX-SLOW-NEXT:    retq
544;
545; AVX-FAST-LABEL: extract_extract01_v8i32_add_i32:
546; AVX-FAST:       # %bb.0:
547; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
548; AVX-FAST-NEXT:    vmovd %xmm0, %eax
549; AVX-FAST-NEXT:    vzeroupper
550; AVX-FAST-NEXT:    retq
551  %x0 = extractelement <8 x i32> %x, i32 0
552  %x1 = extractelement <8 x i32> %x, i32 1
553  %x01 = add i32 %x0, %x1
554  ret i32 %x01
555}
556
; Adds elements 2 and 3 of a 256-bit <8 x i32>.
; Both elements are read from xmm0 in every expected sequence below. With
; fast-hops the horizontal-add result is read from lane 1; without it, two
; scalar extracts feed addl. The AVX sequences end with vzeroupper.
557define i32 @extract_extract23_v8i32_add_i32(<8 x i32> %x) {
558; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32:
559; SSE3-SLOW:       # %bb.0:
560; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
561; SSE3-SLOW-NEXT:    movd %xmm1, %ecx
562; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
563; SSE3-SLOW-NEXT:    movd %xmm0, %eax
564; SSE3-SLOW-NEXT:    addl %ecx, %eax
565; SSE3-SLOW-NEXT:    retq
566;
567; SSE3-FAST-LABEL: extract_extract23_v8i32_add_i32:
568; SSE3-FAST:       # %bb.0:
569; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
570; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
571; SSE3-FAST-NEXT:    movd %xmm0, %eax
572; SSE3-FAST-NEXT:    retq
573;
574; AVX-SLOW-LABEL: extract_extract23_v8i32_add_i32:
575; AVX-SLOW:       # %bb.0:
576; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx
577; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax
578; AVX-SLOW-NEXT:    addl %ecx, %eax
579; AVX-SLOW-NEXT:    vzeroupper
580; AVX-SLOW-NEXT:    retq
581;
582; AVX-FAST-LABEL: extract_extract23_v8i32_add_i32:
583; AVX-FAST:       # %bb.0:
584; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
585; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax
586; AVX-FAST-NEXT:    vzeroupper
587; AVX-FAST-NEXT:    retq
588  %x0 = extractelement <8 x i32> %x, i32 2
589  %x1 = extractelement <8 x i32> %x, i32 3
590  %x01 = add i32 %x0, %x1
591  ret i32 %x01
592}
593
; Adds elements 6 and 7 of a 256-bit <8 x i32>, i.e. a pair that lies in the
; upper 128-bit half of the vector.
; In the SSE3 sequences below the pair is read from xmm1. In the AVX
; sequences the upper half is first extracted from ymm0 (vextractf128, or
; vextracti128 for the AVX2 and AVX512 fast-hops outputs), then handled like
; the 128-bit case: horizontal add read from lane 1 with fast-hops, scalar
; extracts plus addl without.
; The fast-hops AVX outputs have separate per-feature assertion groups
; because the 128-bit extract instruction differs between them.
594define i32 @extract_extract67_v8i32_add_i32(<8 x i32> %x) {
595; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32:
596; SSE3-SLOW:       # %bb.0:
597; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
598; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
599; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
600; SSE3-SLOW-NEXT:    movd %xmm0, %eax
601; SSE3-SLOW-NEXT:    addl %ecx, %eax
602; SSE3-SLOW-NEXT:    retq
603;
604; SSE3-FAST-LABEL: extract_extract67_v8i32_add_i32:
605; SSE3-FAST:       # %bb.0:
606; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1
607; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
608; SSE3-FAST-NEXT:    movd %xmm0, %eax
609; SSE3-FAST-NEXT:    retq
610;
611; AVX-SLOW-LABEL: extract_extract67_v8i32_add_i32:
612; AVX-SLOW:       # %bb.0:
613; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
614; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx
615; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax
616; AVX-SLOW-NEXT:    addl %ecx, %eax
617; AVX-SLOW-NEXT:    vzeroupper
618; AVX-SLOW-NEXT:    retq
619;
620; AVX1-FAST-LABEL: extract_extract67_v8i32_add_i32:
621; AVX1-FAST:       # %bb.0:
622; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
623; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
624; AVX1-FAST-NEXT:    vpextrd $1, %xmm0, %eax
625; AVX1-FAST-NEXT:    vzeroupper
626; AVX1-FAST-NEXT:    retq
627;
628; AVX2-FAST-LABEL: extract_extract67_v8i32_add_i32:
629; AVX2-FAST:       # %bb.0:
630; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
631; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
632; AVX2-FAST-NEXT:    vpextrd $1, %xmm0, %eax
633; AVX2-FAST-NEXT:    vzeroupper
634; AVX2-FAST-NEXT:    retq
635;
636; AVX512-FAST-LABEL: extract_extract67_v8i32_add_i32:
637; AVX512-FAST:       # %bb.0:
638; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
639; AVX512-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
640; AVX512-FAST-NEXT:    vpextrd $1, %xmm0, %eax
641; AVX512-FAST-NEXT:    vzeroupper
642; AVX512-FAST-NEXT:    retq
643  %x0 = extractelement <8 x i32> %x, i32 6
644  %x1 = extractelement <8 x i32> %x, i32 7
645  %x01 = add i32 %x0, %x1
646  ret i32 %x01
647}
648
; Commuted form of the element-0/element-1 add on a 256-bit <8 x i32>: the
; add takes %x1 first and %x0 second.
; The assertions below expect a horizontal add on xmm0 with fast-hops and
; scalar extracts plus addl without it; the AVX sequences end with
; vzeroupper.
649define i32 @extract_extract01_v8i32_add_i32_commute(<8 x i32> %x) {
650; SSE3-SLOW-LABEL: extract_extract01_v8i32_add_i32_commute:
651; SSE3-SLOW:       # %bb.0:
652; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
653; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
654; SSE3-SLOW-NEXT:    movd %xmm0, %eax
655; SSE3-SLOW-NEXT:    addl %ecx, %eax
656; SSE3-SLOW-NEXT:    retq
657;
658; SSE3-FAST-LABEL: extract_extract01_v8i32_add_i32_commute:
659; SSE3-FAST:       # %bb.0:
660; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
661; SSE3-FAST-NEXT:    movd %xmm0, %eax
662; SSE3-FAST-NEXT:    retq
663;
664; AVX-SLOW-LABEL: extract_extract01_v8i32_add_i32_commute:
665; AVX-SLOW:       # %bb.0:
666; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
667; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax
668; AVX-SLOW-NEXT:    addl %ecx, %eax
669; AVX-SLOW-NEXT:    vzeroupper
670; AVX-SLOW-NEXT:    retq
671;
672; AVX-FAST-LABEL: extract_extract01_v8i32_add_i32_commute:
673; AVX-FAST:       # %bb.0:
674; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
675; AVX-FAST-NEXT:    vmovd %xmm0, %eax
676; AVX-FAST-NEXT:    vzeroupper
677; AVX-FAST-NEXT:    retq
678  %x0 = extractelement <8 x i32> %x, i32 0
679  %x1 = extractelement <8 x i32> %x, i32 1
680  %x01 = add i32 %x1, %x0
681  ret i32 %x01
682}
683
; Commuted form of the element-2/element-3 add on a 256-bit <8 x i32>: the
; add takes %x1 (element 3) first and %x0 (element 2) second.
; The assertions below expect a horizontal add on xmm0 read back from lane 1
; with fast-hops, and scalar extracts plus addl without it; the AVX
; sequences end with vzeroupper.
684define i32 @extract_extract23_v8i32_add_i32_commute(<8 x i32> %x) {
685; SSE3-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute:
686; SSE3-SLOW:       # %bb.0:
687; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
688; SSE3-SLOW-NEXT:    movd %xmm1, %ecx
689; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
690; SSE3-SLOW-NEXT:    movd %xmm0, %eax
691; SSE3-SLOW-NEXT:    addl %ecx, %eax
692; SSE3-SLOW-NEXT:    retq
693;
694; SSE3-FAST-LABEL: extract_extract23_v8i32_add_i32_commute:
695; SSE3-FAST:       # %bb.0:
696; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
697; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
698; SSE3-FAST-NEXT:    movd %xmm0, %eax
699; SSE3-FAST-NEXT:    retq
700;
701; AVX-SLOW-LABEL: extract_extract23_v8i32_add_i32_commute:
702; AVX-SLOW:       # %bb.0:
703; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx
704; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax
705; AVX-SLOW-NEXT:    addl %ecx, %eax
706; AVX-SLOW-NEXT:    vzeroupper
707; AVX-SLOW-NEXT:    retq
708;
709; AVX-FAST-LABEL: extract_extract23_v8i32_add_i32_commute:
710; AVX-FAST:       # %bb.0:
711; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
712; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax
713; AVX-FAST-NEXT:    vzeroupper
714; AVX-FAST-NEXT:    retq
715  %x0 = extractelement <8 x i32> %x, i32 2
716  %x1 = extractelement <8 x i32> %x, i32 3
717  %x01 = add i32 %x1, %x0
718  ret i32 %x01
719}
720
; Commuted form of the element-6/element-7 add on a 256-bit <8 x i32>: the
; add takes %x1 (element 7) first and %x0 (element 6) second.
; In the SSE3 sequences below the pair is read from xmm1. In the AVX
; sequences the upper 128-bit half is first extracted from ymm0
; (vextractf128, or vextracti128 for the AVX2 and AVX512 fast-hops outputs),
; then a horizontal add is read from lane 1 with fast-hops, or scalar
; extracts feed addl without it.
721define i32 @extract_extract67_v8i32_add_i32_commute(<8 x i32> %x) {
722; SSE3-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute:
723; SSE3-SLOW:       # %bb.0:
724; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
725; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
726; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
727; SSE3-SLOW-NEXT:    movd %xmm0, %eax
728; SSE3-SLOW-NEXT:    addl %ecx, %eax
729; SSE3-SLOW-NEXT:    retq
730;
731; SSE3-FAST-LABEL: extract_extract67_v8i32_add_i32_commute:
732; SSE3-FAST:       # %bb.0:
733; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1
734; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
735; SSE3-FAST-NEXT:    movd %xmm0, %eax
736; SSE3-FAST-NEXT:    retq
737;
738; AVX-SLOW-LABEL: extract_extract67_v8i32_add_i32_commute:
739; AVX-SLOW:       # %bb.0:
740; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
741; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %ecx
742; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %eax
743; AVX-SLOW-NEXT:    addl %ecx, %eax
744; AVX-SLOW-NEXT:    vzeroupper
745; AVX-SLOW-NEXT:    retq
746;
747; AVX1-FAST-LABEL: extract_extract67_v8i32_add_i32_commute:
748; AVX1-FAST:       # %bb.0:
749; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
750; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
751; AVX1-FAST-NEXT:    vpextrd $1, %xmm0, %eax
752; AVX1-FAST-NEXT:    vzeroupper
753; AVX1-FAST-NEXT:    retq
754;
755; AVX2-FAST-LABEL: extract_extract67_v8i32_add_i32_commute:
756; AVX2-FAST:       # %bb.0:
757; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
758; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
759; AVX2-FAST-NEXT:    vpextrd $1, %xmm0, %eax
760; AVX2-FAST-NEXT:    vzeroupper
761; AVX2-FAST-NEXT:    retq
762;
763; AVX512-FAST-LABEL: extract_extract67_v8i32_add_i32_commute:
764; AVX512-FAST:       # %bb.0:
765; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
766; AVX512-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
767; AVX512-FAST-NEXT:    vpextrd $1, %xmm0, %eax
768; AVX512-FAST-NEXT:    vzeroupper
769; AVX512-FAST-NEXT:    retq
770  %x0 = extractelement <8 x i32> %x, i32 6
771  %x1 = extractelement <8 x i32> %x, i32 7
772  %x01 = add i32 %x1, %x0
773  ret i32 %x01
774}
775
; Adds elements 0 and 1 of a 256-bit <16 x i16>.
; Both elements are read from xmm0 in every expected sequence below: with
; fast-hops, phaddw / vphaddw followed by a low-lane move; without it,
; movd + pextrw $1 followed by addl. The AVX sequences end with vzeroupper.
776define i16 @extract_extract01_v16i16_add_i16(<16 x i16> %x) {
777; SSE3-SLOW-LABEL: extract_extract01_v16i16_add_i16:
778; SSE3-SLOW:       # %bb.0:
779; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
780; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax
781; SSE3-SLOW-NEXT:    addl %ecx, %eax
782; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
783; SSE3-SLOW-NEXT:    retq
784;
785; SSE3-FAST-LABEL: extract_extract01_v16i16_add_i16:
786; SSE3-FAST:       # %bb.0:
787; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
788; SSE3-FAST-NEXT:    movd %xmm0, %eax
789; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
790; SSE3-FAST-NEXT:    retq
791;
792; AVX-SLOW-LABEL: extract_extract01_v16i16_add_i16:
793; AVX-SLOW:       # %bb.0:
794; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
795; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
796; AVX-SLOW-NEXT:    addl %ecx, %eax
797; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
798; AVX-SLOW-NEXT:    vzeroupper
799; AVX-SLOW-NEXT:    retq
800;
801; AVX-FAST-LABEL: extract_extract01_v16i16_add_i16:
802; AVX-FAST:       # %bb.0:
803; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
804; AVX-FAST-NEXT:    vmovd %xmm0, %eax
805; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
806; AVX-FAST-NEXT:    vzeroupper
807; AVX-FAST-NEXT:    retq
808  %x0 = extractelement <16 x i16> %x, i32 0
809  %x1 = extractelement <16 x i16> %x, i32 1
810  %x01 = add i16 %x0, %x1
811  ret i16 %x01
812}
813
; Adds elements 2 and 3 of a 256-bit <16 x i16>.
; Both elements are read from xmm0 in every expected sequence below: with
; fast-hops, a 16-bit horizontal add whose result is read from lane 1
; (pextrw $1 / vpextrw $1); without it, pextrw $2 and pextrw $3 followed by
; addl. The AVX sequences end with vzeroupper.
814define i16 @extract_extract23_v16i16_add_i16(<16 x i16> %x) {
815; SSE3-SLOW-LABEL: extract_extract23_v16i16_add_i16:
816; SSE3-SLOW:       # %bb.0:
817; SSE3-SLOW-NEXT:    pextrw $2, %xmm0, %ecx
818; SSE3-SLOW-NEXT:    pextrw $3, %xmm0, %eax
819; SSE3-SLOW-NEXT:    addl %ecx, %eax
820; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
821; SSE3-SLOW-NEXT:    retq
822;
823; SSE3-FAST-LABEL: extract_extract23_v16i16_add_i16:
824; SSE3-FAST:       # %bb.0:
825; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
826; SSE3-FAST-NEXT:    pextrw $1, %xmm0, %eax
827; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
828; SSE3-FAST-NEXT:    retq
829;
830; AVX-SLOW-LABEL: extract_extract23_v16i16_add_i16:
831; AVX-SLOW:       # %bb.0:
832; AVX-SLOW-NEXT:    vpextrw $2, %xmm0, %ecx
833; AVX-SLOW-NEXT:    vpextrw $3, %xmm0, %eax
834; AVX-SLOW-NEXT:    addl %ecx, %eax
835; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
836; AVX-SLOW-NEXT:    vzeroupper
837; AVX-SLOW-NEXT:    retq
838;
839; AVX-FAST-LABEL: extract_extract23_v16i16_add_i16:
840; AVX-FAST:       # %bb.0:
841; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
842; AVX-FAST-NEXT:    vpextrw $1, %xmm0, %eax
843; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
844; AVX-FAST-NEXT:    vzeroupper
845; AVX-FAST-NEXT:    retq
846  %x0 = extractelement <16 x i16> %x, i32 2
847  %x1 = extractelement <16 x i16> %x, i32 3
848  %x01 = add i16 %x0, %x1
849  ret i16 %x01
850}
851
; Sums elements 8 and 9 of a <16 x i16> vector, i.e. the first two lanes of
; the upper 128-bit half. The SSE checks read that half directly from xmm1;
; the AVX checks first extract it from ymm0 (vextractf128 on AVX1,
; vextracti128 on AVX2/AVX512VL), which is why each AVX level has its own
; check prefix here. Fast-hops runs then expect a single (v)phaddw.
852define i16 @extract_extract89_v16i16_add_i16(<16 x i16> %x) {
853; SSE3-SLOW-LABEL: extract_extract89_v16i16_add_i16:
854; SSE3-SLOW:       # %bb.0:
855; SSE3-SLOW-NEXT:    movd %xmm1, %ecx
856; SSE3-SLOW-NEXT:    pextrw $1, %xmm1, %eax
857; SSE3-SLOW-NEXT:    addl %ecx, %eax
858; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
859; SSE3-SLOW-NEXT:    retq
860;
861; SSE3-FAST-LABEL: extract_extract89_v16i16_add_i16:
862; SSE3-FAST:       # %bb.0:
863; SSE3-FAST-NEXT:    phaddw %xmm1, %xmm1
864; SSE3-FAST-NEXT:    movd %xmm1, %eax
865; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
866; SSE3-FAST-NEXT:    retq
867;
868; AVX1-SLOW-LABEL: extract_extract89_v16i16_add_i16:
869; AVX1-SLOW:       # %bb.0:
870; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
871; AVX1-SLOW-NEXT:    vmovd %xmm0, %ecx
872; AVX1-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
873; AVX1-SLOW-NEXT:    addl %ecx, %eax
874; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
875; AVX1-SLOW-NEXT:    vzeroupper
876; AVX1-SLOW-NEXT:    retq
877;
878; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16:
879; AVX1-FAST:       # %bb.0:
880; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
881; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
882; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
883; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
884; AVX1-FAST-NEXT:    vzeroupper
885; AVX1-FAST-NEXT:    retq
886;
887; AVX2-SLOW-LABEL: extract_extract89_v16i16_add_i16:
888; AVX2-SLOW:       # %bb.0:
889; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
890; AVX2-SLOW-NEXT:    vmovd %xmm0, %ecx
891; AVX2-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
892; AVX2-SLOW-NEXT:    addl %ecx, %eax
893; AVX2-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
894; AVX2-SLOW-NEXT:    vzeroupper
895; AVX2-SLOW-NEXT:    retq
896;
897; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16:
898; AVX2-FAST:       # %bb.0:
899; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
900; AVX2-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
901; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
902; AVX2-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
903; AVX2-FAST-NEXT:    vzeroupper
904; AVX2-FAST-NEXT:    retq
905;
906; AVX512-SLOW-LABEL: extract_extract89_v16i16_add_i16:
907; AVX512-SLOW:       # %bb.0:
908; AVX512-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
909; AVX512-SLOW-NEXT:    vmovd %xmm0, %ecx
910; AVX512-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
911; AVX512-SLOW-NEXT:    addl %ecx, %eax
912; AVX512-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
913; AVX512-SLOW-NEXT:    vzeroupper
914; AVX512-SLOW-NEXT:    retq
915;
916; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16:
917; AVX512-FAST:       # %bb.0:
918; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
919; AVX512-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
920; AVX512-FAST-NEXT:    vmovd %xmm0, %eax
921; AVX512-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
922; AVX512-FAST-NEXT:    vzeroupper
923; AVX512-FAST-NEXT:    retq
924  %x0 = extractelement <16 x i16> %x, i32 8
925  %x1 = extractelement <16 x i16> %x, i32 9
926  %x01 = add i16 %x0, %x1
927  ret i16 %x01
928}
929
; Sums elements 0 and 1 of a <16 x i16> vector with the add operands swapped
; (%x1 + %x0). Add is commutative, so fast-hops runs are still expected to
; form a single (v)phaddw and read lane 0 of the result.
930define i16 @extract_extract01_v16i16_add_i16_commute(<16 x i16> %x) {
931; SSE3-SLOW-LABEL: extract_extract01_v16i16_add_i16_commute:
932; SSE3-SLOW:       # %bb.0:
933; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
934; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax
935; SSE3-SLOW-NEXT:    addl %ecx, %eax
936; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
937; SSE3-SLOW-NEXT:    retq
938;
939; SSE3-FAST-LABEL: extract_extract01_v16i16_add_i16_commute:
940; SSE3-FAST:       # %bb.0:
941; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
942; SSE3-FAST-NEXT:    movd %xmm0, %eax
943; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
944; SSE3-FAST-NEXT:    retq
945;
946; AVX-SLOW-LABEL: extract_extract01_v16i16_add_i16_commute:
947; AVX-SLOW:       # %bb.0:
948; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
949; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
950; AVX-SLOW-NEXT:    addl %ecx, %eax
951; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
952; AVX-SLOW-NEXT:    vzeroupper
953; AVX-SLOW-NEXT:    retq
954;
955; AVX-FAST-LABEL: extract_extract01_v16i16_add_i16_commute:
956; AVX-FAST:       # %bb.0:
957; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
958; AVX-FAST-NEXT:    vmovd %xmm0, %eax
959; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
960; AVX-FAST-NEXT:    vzeroupper
961; AVX-FAST-NEXT:    retq
962  %x0 = extractelement <16 x i16> %x, i32 0
963  %x1 = extractelement <16 x i16> %x, i32 1
964  %x01 = add i16 %x1, %x0
965  ret i16 %x01
966}
967
; Sums elements 4 and 5 of a <16 x i16> vector with the add operands swapped
; (%x1 + %x0). Both elements are in the low 128 bits; fast-hops runs are
; expected to use one (v)phaddw and then extract lane 2 of the result.
968define i16 @extract_extract45_v16i16_add_i16_commute(<16 x i16> %x) {
969; SSE3-SLOW-LABEL: extract_extract45_v16i16_add_i16_commute:
970; SSE3-SLOW:       # %bb.0:
971; SSE3-SLOW-NEXT:    pextrw $4, %xmm0, %ecx
972; SSE3-SLOW-NEXT:    pextrw $5, %xmm0, %eax
973; SSE3-SLOW-NEXT:    addl %ecx, %eax
974; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
975; SSE3-SLOW-NEXT:    retq
976;
977; SSE3-FAST-LABEL: extract_extract45_v16i16_add_i16_commute:
978; SSE3-FAST:       # %bb.0:
979; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
980; SSE3-FAST-NEXT:    pextrw $2, %xmm0, %eax
981; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
982; SSE3-FAST-NEXT:    retq
983;
984; AVX-SLOW-LABEL: extract_extract45_v16i16_add_i16_commute:
985; AVX-SLOW:       # %bb.0:
986; AVX-SLOW-NEXT:    vpextrw $4, %xmm0, %ecx
987; AVX-SLOW-NEXT:    vpextrw $5, %xmm0, %eax
988; AVX-SLOW-NEXT:    addl %ecx, %eax
989; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
990; AVX-SLOW-NEXT:    vzeroupper
991; AVX-SLOW-NEXT:    retq
992;
993; AVX-FAST-LABEL: extract_extract45_v16i16_add_i16_commute:
994; AVX-FAST:       # %bb.0:
995; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
996; AVX-FAST-NEXT:    vpextrw $2, %xmm0, %eax
997; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
998; AVX-FAST-NEXT:    vzeroupper
999; AVX-FAST-NEXT:    retq
1000  %x0 = extractelement <16 x i16> %x, i32 4
1001  %x1 = extractelement <16 x i16> %x, i32 5
1002  %x01 = add i16 %x1, %x0
1003  ret i16 %x01
1004}
1005
; Sums elements 8 and 9 of a <16 x i16> vector (upper 128-bit half) with the
; add operands swapped (%x1 + %x0). The SSE checks read xmm1; the AVX checks
; extract the upper half of ymm0 first (vextractf128 on AVX1, vextracti128 on
; AVX2/AVX512VL). Fast-hops runs are still expected to form (v)phaddw.
1006define i16 @extract_extract89_v16i16_add_i16_commute(<16 x i16> %x) {
1007; SSE3-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute:
1008; SSE3-SLOW:       # %bb.0:
1009; SSE3-SLOW-NEXT:    movd %xmm1, %ecx
1010; SSE3-SLOW-NEXT:    pextrw $1, %xmm1, %eax
1011; SSE3-SLOW-NEXT:    addl %ecx, %eax
1012; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1013; SSE3-SLOW-NEXT:    retq
1014;
1015; SSE3-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
1016; SSE3-FAST:       # %bb.0:
1017; SSE3-FAST-NEXT:    phaddw %xmm1, %xmm1
1018; SSE3-FAST-NEXT:    movd %xmm1, %eax
1019; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1020; SSE3-FAST-NEXT:    retq
1021;
1022; AVX1-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute:
1023; AVX1-SLOW:       # %bb.0:
1024; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
1025; AVX1-SLOW-NEXT:    vmovd %xmm0, %ecx
1026; AVX1-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
1027; AVX1-SLOW-NEXT:    addl %ecx, %eax
1028; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1029; AVX1-SLOW-NEXT:    vzeroupper
1030; AVX1-SLOW-NEXT:    retq
1031;
1032; AVX1-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
1033; AVX1-FAST:       # %bb.0:
1034; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
1035; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
1036; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
1037; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1038; AVX1-FAST-NEXT:    vzeroupper
1039; AVX1-FAST-NEXT:    retq
1040;
1041; AVX2-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute:
1042; AVX2-SLOW:       # %bb.0:
1043; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
1044; AVX2-SLOW-NEXT:    vmovd %xmm0, %ecx
1045; AVX2-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
1046; AVX2-SLOW-NEXT:    addl %ecx, %eax
1047; AVX2-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1048; AVX2-SLOW-NEXT:    vzeroupper
1049; AVX2-SLOW-NEXT:    retq
1050;
1051; AVX2-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
1052; AVX2-FAST:       # %bb.0:
1053; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
1054; AVX2-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
1055; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
1056; AVX2-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1057; AVX2-FAST-NEXT:    vzeroupper
1058; AVX2-FAST-NEXT:    retq
1059;
1060; AVX512-SLOW-LABEL: extract_extract89_v16i16_add_i16_commute:
1061; AVX512-SLOW:       # %bb.0:
1062; AVX512-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
1063; AVX512-SLOW-NEXT:    vmovd %xmm0, %ecx
1064; AVX512-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
1065; AVX512-SLOW-NEXT:    addl %ecx, %eax
1066; AVX512-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1067; AVX512-SLOW-NEXT:    vzeroupper
1068; AVX512-SLOW-NEXT:    retq
1069;
1070; AVX512-FAST-LABEL: extract_extract89_v16i16_add_i16_commute:
1071; AVX512-FAST:       # %bb.0:
1072; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
1073; AVX512-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
1074; AVX512-FAST-NEXT:    vmovd %xmm0, %eax
1075; AVX512-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1076; AVX512-FAST-NEXT:    vzeroupper
1077; AVX512-FAST-NEXT:    retq
1078  %x0 = extractelement <16 x i16> %x, i32 8
1079  %x1 = extractelement <16 x i16> %x, i32 9
1080  %x01 = add i16 %x1, %x0
1081  ret i16 %x01
1082}
1083
; Computes element 0 minus element 1 of an <8 x i32> vector. Without fast-hops
; the checks expect two scalar extracts and a subl; with fast-hops they expect
; one (v)phsubd on the low 128 bits and a move of lane 0 to eax.
1084define i32 @extract_extract01_v8i32_sub_i32(<8 x i32> %x) {
1085; SSE3-SLOW-LABEL: extract_extract01_v8i32_sub_i32:
1086; SSE3-SLOW:       # %bb.0:
1087; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1088; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1089; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1090; SSE3-SLOW-NEXT:    subl %ecx, %eax
1091; SSE3-SLOW-NEXT:    retq
1092;
1093; SSE3-FAST-LABEL: extract_extract01_v8i32_sub_i32:
1094; SSE3-FAST:       # %bb.0:
1095; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
1096; SSE3-FAST-NEXT:    movd %xmm0, %eax
1097; SSE3-FAST-NEXT:    retq
1098;
1099; AVX-SLOW-LABEL: extract_extract01_v8i32_sub_i32:
1100; AVX-SLOW:       # %bb.0:
1101; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
1102; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx
1103; AVX-SLOW-NEXT:    subl %ecx, %eax
1104; AVX-SLOW-NEXT:    vzeroupper
1105; AVX-SLOW-NEXT:    retq
1106;
1107; AVX-FAST-LABEL: extract_extract01_v8i32_sub_i32:
1108; AVX-FAST:       # %bb.0:
1109; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
1110; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1111; AVX-FAST-NEXT:    vzeroupper
1112; AVX-FAST-NEXT:    retq
1113  %x0 = extractelement <8 x i32> %x, i32 0
1114  %x1 = extractelement <8 x i32> %x, i32 1
1115  %x01 = sub i32 %x0, %x1
1116  ret i32 %x01
1117}
1118
; Computes element 2 minus element 3 of an <8 x i32> vector (both in the low
; 128 bits). Fast-hops runs are expected to use one (v)phsubd and then read
; lane 1 of the result; other runs extract both lanes and subtract in scalar.
1119define i32 @extract_extract23_v8i32_sub_i32(<8 x i32> %x) {
1120; SSE3-SLOW-LABEL: extract_extract23_v8i32_sub_i32:
1121; SSE3-SLOW:       # %bb.0:
1122; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1123; SSE3-SLOW-NEXT:    movd %xmm1, %eax
1124; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
1125; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1126; SSE3-SLOW-NEXT:    subl %ecx, %eax
1127; SSE3-SLOW-NEXT:    retq
1128;
1129; SSE3-FAST-LABEL: extract_extract23_v8i32_sub_i32:
1130; SSE3-FAST:       # %bb.0:
1131; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
1132; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1133; SSE3-FAST-NEXT:    movd %xmm0, %eax
1134; SSE3-FAST-NEXT:    retq
1135;
1136; AVX-SLOW-LABEL: extract_extract23_v8i32_sub_i32:
1137; AVX-SLOW:       # %bb.0:
1138; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %eax
1139; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %ecx
1140; AVX-SLOW-NEXT:    subl %ecx, %eax
1141; AVX-SLOW-NEXT:    vzeroupper
1142; AVX-SLOW-NEXT:    retq
1143;
1144; AVX-FAST-LABEL: extract_extract23_v8i32_sub_i32:
1145; AVX-FAST:       # %bb.0:
1146; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
1147; AVX-FAST-NEXT:    vpextrd $1, %xmm0, %eax
1148; AVX-FAST-NEXT:    vzeroupper
1149; AVX-FAST-NEXT:    retq
1150  %x0 = extractelement <8 x i32> %x, i32 2
1151  %x1 = extractelement <8 x i32> %x, i32 3
1152  %x01 = sub i32 %x0, %x1
1153  ret i32 %x01
1154}
1155
; Computes element 6 minus element 7 of an <8 x i32> vector, i.e. the last two
; lanes of the upper 128-bit half. The SSE checks operate on xmm1; the AVX
; checks extract the upper half of ymm0 first. All slow AVX runs share one
; check block (vextractf128), while the fast-hops AVX runs are split per
; feature level because AVX1 uses vextractf128 and AVX2/AVX512VL use
; vextracti128 before the vphsubd.
1156define i32 @extract_extract67_v8i32_sub_i32(<8 x i32> %x) {
1157; SSE3-SLOW-LABEL: extract_extract67_v8i32_sub_i32:
1158; SSE3-SLOW:       # %bb.0:
1159; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
1160; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1161; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
1162; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1163; SSE3-SLOW-NEXT:    subl %ecx, %eax
1164; SSE3-SLOW-NEXT:    retq
1165;
1166; SSE3-FAST-LABEL: extract_extract67_v8i32_sub_i32:
1167; SSE3-FAST:       # %bb.0:
1168; SSE3-FAST-NEXT:    phsubd %xmm1, %xmm1
1169; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1170; SSE3-FAST-NEXT:    movd %xmm0, %eax
1171; SSE3-FAST-NEXT:    retq
1172;
1173; AVX-SLOW-LABEL: extract_extract67_v8i32_sub_i32:
1174; AVX-SLOW:       # %bb.0:
1175; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm0
1176; AVX-SLOW-NEXT:    vextractps $2, %xmm0, %eax
1177; AVX-SLOW-NEXT:    vextractps $3, %xmm0, %ecx
1178; AVX-SLOW-NEXT:    subl %ecx, %eax
1179; AVX-SLOW-NEXT:    vzeroupper
1180; AVX-SLOW-NEXT:    retq
1181;
1182; AVX1-FAST-LABEL: extract_extract67_v8i32_sub_i32:
1183; AVX1-FAST:       # %bb.0:
1184; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm0
1185; AVX1-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
1186; AVX1-FAST-NEXT:    vpextrd $1, %xmm0, %eax
1187; AVX1-FAST-NEXT:    vzeroupper
1188; AVX1-FAST-NEXT:    retq
1189;
1190; AVX2-FAST-LABEL: extract_extract67_v8i32_sub_i32:
1191; AVX2-FAST:       # %bb.0:
1192; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
1193; AVX2-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
1194; AVX2-FAST-NEXT:    vpextrd $1, %xmm0, %eax
1195; AVX2-FAST-NEXT:    vzeroupper
1196; AVX2-FAST-NEXT:    retq
1197;
1198; AVX512-FAST-LABEL: extract_extract67_v8i32_sub_i32:
1199; AVX512-FAST:       # %bb.0:
1200; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
1201; AVX512-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
1202; AVX512-FAST-NEXT:    vpextrd $1, %xmm0, %eax
1203; AVX512-FAST-NEXT:    vzeroupper
1204; AVX512-FAST-NEXT:    retq
1205  %x0 = extractelement <8 x i32> %x, i32 6
1206  %x1 = extractelement <8 x i32> %x, i32 7
1207  %x01 = sub i32 %x0, %x1
1208  ret i32 %x01
1209}
1210
1211; Negative test...or get hoppy and negate?
1212
; Computes element 1 minus element 0 of an <8 x i32> vector, i.e. the operands
; are reversed relative to extract_extract01_v8i32_sub_i32. Sub is not
; commutative, and the checks expect no (v)phsubd here: every run, with or
; without fast-hops, uses two extracts and a scalar subl, so the fast and slow
; runs share the common check prefixes.
1213define i32 @extract_extract01_v8i32_sub_i32_commute(<8 x i32> %x) {
1214; SSE3-LABEL: extract_extract01_v8i32_sub_i32_commute:
1215; SSE3:       # %bb.0:
1216; SSE3-NEXT:    movd %xmm0, %ecx
1217; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1218; SSE3-NEXT:    movd %xmm0, %eax
1219; SSE3-NEXT:    subl %ecx, %eax
1220; SSE3-NEXT:    retq
1221;
1222; AVX-LABEL: extract_extract01_v8i32_sub_i32_commute:
1223; AVX:       # %bb.0:
1224; AVX-NEXT:    vmovd %xmm0, %ecx
1225; AVX-NEXT:    vpextrd $1, %xmm0, %eax
1226; AVX-NEXT:    subl %ecx, %eax
1227; AVX-NEXT:    vzeroupper
1228; AVX-NEXT:    retq
1229  %x0 = extractelement <8 x i32> %x, i32 0
1230  %x1 = extractelement <8 x i32> %x, i32 1
1231  %x01 = sub i32 %x1, %x0
1232  ret i32 %x01
1233}
1234
; Computes element 0 minus element 1 of a <16 x i16> vector. Without fast-hops
; the checks expect two extracts and a 32-bit subl whose low 16 bits are
; returned; with fast-hops they expect one (v)phsubw and a move of lane 0.
1235define i16 @extract_extract01_v16i16_sub_i16(<16 x i16> %x) {
1236; SSE3-SLOW-LABEL: extract_extract01_v16i16_sub_i16:
1237; SSE3-SLOW:       # %bb.0:
1238; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1239; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %ecx
1240; SSE3-SLOW-NEXT:    subl %ecx, %eax
1241; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1242; SSE3-SLOW-NEXT:    retq
1243;
1244; SSE3-FAST-LABEL: extract_extract01_v16i16_sub_i16:
1245; SSE3-FAST:       # %bb.0:
1246; SSE3-FAST-NEXT:    phsubw %xmm0, %xmm0
1247; SSE3-FAST-NEXT:    movd %xmm0, %eax
1248; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1249; SSE3-FAST-NEXT:    retq
1250;
1251; AVX-SLOW-LABEL: extract_extract01_v16i16_sub_i16:
1252; AVX-SLOW:       # %bb.0:
1253; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
1254; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %ecx
1255; AVX-SLOW-NEXT:    subl %ecx, %eax
1256; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1257; AVX-SLOW-NEXT:    vzeroupper
1258; AVX-SLOW-NEXT:    retq
1259;
1260; AVX-FAST-LABEL: extract_extract01_v16i16_sub_i16:
1261; AVX-FAST:       # %bb.0:
1262; AVX-FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
1263; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1264; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1265; AVX-FAST-NEXT:    vzeroupper
1266; AVX-FAST-NEXT:    retq
1267  %x0 = extractelement <16 x i16> %x, i32 0
1268  %x1 = extractelement <16 x i16> %x, i32 1
1269  %x01 = sub i16 %x0, %x1
1270  ret i16 %x01
1271}
1272
1273; Negative test...or get hoppy and negate?
1274
; Computes element 1 minus element 0 of a <16 x i16> vector, i.e. the operands
; are reversed relative to extract_extract01_v16i16_sub_i16. The checks expect
; no (v)phsubw in any configuration: fast and slow runs share the common check
; prefixes and all use two extracts plus a scalar subl.
1275define i16 @extract_extract01_v16i16_sub_i16_commute(<16 x i16> %x) {
1276; SSE3-LABEL: extract_extract01_v16i16_sub_i16_commute:
1277; SSE3:       # %bb.0:
1278; SSE3-NEXT:    movd %xmm0, %ecx
1279; SSE3-NEXT:    pextrw $1, %xmm0, %eax
1280; SSE3-NEXT:    subl %ecx, %eax
1281; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax
1282; SSE3-NEXT:    retq
1283;
1284; AVX-LABEL: extract_extract01_v16i16_sub_i16_commute:
1285; AVX:       # %bb.0:
1286; AVX-NEXT:    vmovd %xmm0, %ecx
1287; AVX-NEXT:    vpextrw $1, %xmm0, %eax
1288; AVX-NEXT:    subl %ecx, %eax
1289; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
1290; AVX-NEXT:    vzeroupper
1291; AVX-NEXT:    retq
1292  %x0 = extractelement <16 x i16> %x, i32 0
1293  %x1 = extractelement <16 x i16> %x, i32 1
1294  %x01 = sub i16 %x1, %x0
1295  ret i16 %x01
1296}
1297
1298; 512-bit vectors, i32/i16, add/sub
1299
; Sums elements 0 and 1 of a 512-bit <16 x i32> vector. Only the low 128 bits
; (xmm0) are touched in every configuration; fast-hops runs are expected to
; use a single 128-bit (v)phaddd rather than a wider horizontal add.
1300define i32 @extract_extract01_v16i32_add_i32(<16 x i32> %x) {
1301; SSE3-SLOW-LABEL: extract_extract01_v16i32_add_i32:
1302; SSE3-SLOW:       # %bb.0:
1303; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1304; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1305; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1306; SSE3-SLOW-NEXT:    addl %ecx, %eax
1307; SSE3-SLOW-NEXT:    retq
1308;
1309; SSE3-FAST-LABEL: extract_extract01_v16i32_add_i32:
1310; SSE3-FAST:       # %bb.0:
1311; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
1312; SSE3-FAST-NEXT:    movd %xmm0, %eax
1313; SSE3-FAST-NEXT:    retq
1314;
1315; AVX-SLOW-LABEL: extract_extract01_v16i32_add_i32:
1316; AVX-SLOW:       # %bb.0:
1317; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
1318; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax
1319; AVX-SLOW-NEXT:    addl %ecx, %eax
1320; AVX-SLOW-NEXT:    vzeroupper
1321; AVX-SLOW-NEXT:    retq
1322;
1323; AVX-FAST-LABEL: extract_extract01_v16i32_add_i32:
1324; AVX-FAST:       # %bb.0:
1325; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1326; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1327; AVX-FAST-NEXT:    vzeroupper
1328; AVX-FAST-NEXT:    retq
1329  %x0 = extractelement <16 x i32> %x, i32 0
1330  %x1 = extractelement <16 x i32> %x, i32 1
1331  %x01 = add i32 %x0, %x1
1332  ret i32 %x01
1333}
1334
; Sums elements 0 and 1 of a 512-bit <16 x i32> vector with the add operands
; swapped (%x1 + %x0). Add is commutative, so fast-hops runs are still
; expected to form a single 128-bit (v)phaddd on xmm0.
1335define i32 @extract_extract01_v16i32_add_i32_commute(<16 x i32> %x) {
1336; SSE3-SLOW-LABEL: extract_extract01_v16i32_add_i32_commute:
1337; SSE3-SLOW:       # %bb.0:
1338; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1339; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1340; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1341; SSE3-SLOW-NEXT:    addl %ecx, %eax
1342; SSE3-SLOW-NEXT:    retq
1343;
1344; SSE3-FAST-LABEL: extract_extract01_v16i32_add_i32_commute:
1345; SSE3-FAST:       # %bb.0:
1346; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
1347; SSE3-FAST-NEXT:    movd %xmm0, %eax
1348; SSE3-FAST-NEXT:    retq
1349;
1350; AVX-SLOW-LABEL: extract_extract01_v16i32_add_i32_commute:
1351; AVX-SLOW:       # %bb.0:
1352; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
1353; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax
1354; AVX-SLOW-NEXT:    addl %ecx, %eax
1355; AVX-SLOW-NEXT:    vzeroupper
1356; AVX-SLOW-NEXT:    retq
1357;
1358; AVX-FAST-LABEL: extract_extract01_v16i32_add_i32_commute:
1359; AVX-FAST:       # %bb.0:
1360; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1361; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1362; AVX-FAST-NEXT:    vzeroupper
1363; AVX-FAST-NEXT:    retq
1364  %x0 = extractelement <16 x i32> %x, i32 0
1365  %x1 = extractelement <16 x i32> %x, i32 1
1366  %x01 = add i32 %x1, %x0
1367  ret i32 %x01
1368}
1369
; Sums elements 0 and 1 of a 512-bit <32 x i16> vector. Only the low 128 bits
; (xmm0) are touched in every configuration; fast-hops runs are expected to
; use a single 128-bit (v)phaddw and return lane 0.
1370define i16 @extract_extract01_v32i16_add_i16(<32 x i16> %x) {
1371; SSE3-SLOW-LABEL: extract_extract01_v32i16_add_i16:
1372; SSE3-SLOW:       # %bb.0:
1373; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1374; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax
1375; SSE3-SLOW-NEXT:    addl %ecx, %eax
1376; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1377; SSE3-SLOW-NEXT:    retq
1378;
1379; SSE3-FAST-LABEL: extract_extract01_v32i16_add_i16:
1380; SSE3-FAST:       # %bb.0:
1381; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
1382; SSE3-FAST-NEXT:    movd %xmm0, %eax
1383; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1384; SSE3-FAST-NEXT:    retq
1385;
1386; AVX-SLOW-LABEL: extract_extract01_v32i16_add_i16:
1387; AVX-SLOW:       # %bb.0:
1388; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
1389; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
1390; AVX-SLOW-NEXT:    addl %ecx, %eax
1391; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1392; AVX-SLOW-NEXT:    vzeroupper
1393; AVX-SLOW-NEXT:    retq
1394;
1395; AVX-FAST-LABEL: extract_extract01_v32i16_add_i16:
1396; AVX-FAST:       # %bb.0:
1397; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
1398; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1399; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1400; AVX-FAST-NEXT:    vzeroupper
1401; AVX-FAST-NEXT:    retq
1402  %x0 = extractelement <32 x i16> %x, i32 0
1403  %x1 = extractelement <32 x i16> %x, i32 1
1404  %x01 = add i16 %x0, %x1
1405  ret i16 %x01
1406}
1407
; Sums elements 0 and 1 of a 512-bit <32 x i16> vector with the add operands
; swapped (%x1 + %x0). Add is commutative, so fast-hops runs are still
; expected to form a single 128-bit (v)phaddw on xmm0.
1408define i16 @extract_extract01_v32i16_add_i16_commute(<32 x i16> %x) {
1409; SSE3-SLOW-LABEL: extract_extract01_v32i16_add_i16_commute:
1410; SSE3-SLOW:       # %bb.0:
1411; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1412; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %eax
1413; SSE3-SLOW-NEXT:    addl %ecx, %eax
1414; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1415; SSE3-SLOW-NEXT:    retq
1416;
1417; SSE3-FAST-LABEL: extract_extract01_v32i16_add_i16_commute:
1418; SSE3-FAST:       # %bb.0:
1419; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
1420; SSE3-FAST-NEXT:    movd %xmm0, %eax
1421; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1422; SSE3-FAST-NEXT:    retq
1423;
1424; AVX-SLOW-LABEL: extract_extract01_v32i16_add_i16_commute:
1425; AVX-SLOW:       # %bb.0:
1426; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
1427; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %eax
1428; AVX-SLOW-NEXT:    addl %ecx, %eax
1429; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1430; AVX-SLOW-NEXT:    vzeroupper
1431; AVX-SLOW-NEXT:    retq
1432;
1433; AVX-FAST-LABEL: extract_extract01_v32i16_add_i16_commute:
1434; AVX-FAST:       # %bb.0:
1435; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
1436; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1437; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1438; AVX-FAST-NEXT:    vzeroupper
1439; AVX-FAST-NEXT:    retq
1440  %x0 = extractelement <32 x i16> %x, i32 0
1441  %x1 = extractelement <32 x i16> %x, i32 1
1442  %x01 = add i16 %x1, %x0
1443  ret i16 %x01
1444}
1445
; Computes element 0 minus element 1 of a 512-bit <16 x i32> vector. Only the
; low 128 bits (xmm0) are touched; fast-hops runs are expected to use a single
; 128-bit (v)phsubd, other runs two extracts and a scalar subl.
1446define i32 @extract_extract01_v16i32_sub_i32(<16 x i32> %x) {
1447; SSE3-SLOW-LABEL: extract_extract01_v16i32_sub_i32:
1448; SSE3-SLOW:       # %bb.0:
1449; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1450; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1451; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1452; SSE3-SLOW-NEXT:    subl %ecx, %eax
1453; SSE3-SLOW-NEXT:    retq
1454;
1455; SSE3-FAST-LABEL: extract_extract01_v16i32_sub_i32:
1456; SSE3-FAST:       # %bb.0:
1457; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
1458; SSE3-FAST-NEXT:    movd %xmm0, %eax
1459; SSE3-FAST-NEXT:    retq
1460;
1461; AVX-SLOW-LABEL: extract_extract01_v16i32_sub_i32:
1462; AVX-SLOW:       # %bb.0:
1463; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
1464; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %ecx
1465; AVX-SLOW-NEXT:    subl %ecx, %eax
1466; AVX-SLOW-NEXT:    vzeroupper
1467; AVX-SLOW-NEXT:    retq
1468;
1469; AVX-FAST-LABEL: extract_extract01_v16i32_sub_i32:
1470; AVX-FAST:       # %bb.0:
1471; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
1472; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1473; AVX-FAST-NEXT:    vzeroupper
1474; AVX-FAST-NEXT:    retq
1475  %x0 = extractelement <16 x i32> %x, i32 0
1476  %x1 = extractelement <16 x i32> %x, i32 1
1477  %x01 = sub i32 %x0, %x1
1478  ret i32 %x01
1479}
1480
; Negative test: computes element 1 minus element 0 of a 512-bit <16 x i32>
; vector, i.e. the operands are reversed relative to
; extract_extract01_v16i32_sub_i32. The checks expect no (v)phsubd in any
; configuration, so fast and slow runs share the common check prefixes.
1481define i32 @extract_extract01_v16i32_sub_i32_commute(<16 x i32> %x) {
1482; SSE3-LABEL: extract_extract01_v16i32_sub_i32_commute:
1483; SSE3:       # %bb.0:
1484; SSE3-NEXT:    movd %xmm0, %ecx
1485; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1486; SSE3-NEXT:    movd %xmm0, %eax
1487; SSE3-NEXT:    subl %ecx, %eax
1488; SSE3-NEXT:    retq
1489;
1490; AVX-LABEL: extract_extract01_v16i32_sub_i32_commute:
1491; AVX:       # %bb.0:
1492; AVX-NEXT:    vmovd %xmm0, %ecx
1493; AVX-NEXT:    vpextrd $1, %xmm0, %eax
1494; AVX-NEXT:    subl %ecx, %eax
1495; AVX-NEXT:    vzeroupper
1496; AVX-NEXT:    retq
1497  %x0 = extractelement <16 x i32> %x, i32 0
1498  %x1 = extractelement <16 x i32> %x, i32 1
1499  %x01 = sub i32 %x1, %x0
1500  ret i32 %x01
1501}
1502
; Computes element 0 minus element 1 of a 512-bit <32 x i16> vector. Only the
; low 128 bits (xmm0) are touched; fast-hops runs are expected to use a single
; 128-bit (v)phsubw, other runs two extracts and a scalar subl.
1503define i16 @extract_extract01_v32i16_sub_i16(<32 x i16> %x) {
1504; SSE3-SLOW-LABEL: extract_extract01_v32i16_sub_i16:
1505; SSE3-SLOW:       # %bb.0:
1506; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1507; SSE3-SLOW-NEXT:    pextrw $1, %xmm0, %ecx
1508; SSE3-SLOW-NEXT:    subl %ecx, %eax
1509; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1510; SSE3-SLOW-NEXT:    retq
1511;
1512; SSE3-FAST-LABEL: extract_extract01_v32i16_sub_i16:
1513; SSE3-FAST:       # %bb.0:
1514; SSE3-FAST-NEXT:    phsubw %xmm0, %xmm0
1515; SSE3-FAST-NEXT:    movd %xmm0, %eax
1516; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1517; SSE3-FAST-NEXT:    retq
1518;
1519; AVX-SLOW-LABEL: extract_extract01_v32i16_sub_i16:
1520; AVX-SLOW:       # %bb.0:
1521; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
1522; AVX-SLOW-NEXT:    vpextrw $1, %xmm0, %ecx
1523; AVX-SLOW-NEXT:    subl %ecx, %eax
1524; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1525; AVX-SLOW-NEXT:    vzeroupper
1526; AVX-SLOW-NEXT:    retq
1527;
1528; AVX-FAST-LABEL: extract_extract01_v32i16_sub_i16:
1529; AVX-FAST:       # %bb.0:
1530; AVX-FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
1531; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1532; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1533; AVX-FAST-NEXT:    vzeroupper
1534; AVX-FAST-NEXT:    retq
1535  %x0 = extractelement <32 x i16> %x, i32 0
1536  %x1 = extractelement <32 x i16> %x, i32 1
1537  %x01 = sub i16 %x0, %x1
1538  ret i16 %x01
1539}
1540
; Negative test: computes element 1 minus element 0 of a 512-bit <32 x i16>
; vector, i.e. the operands are reversed relative to
; extract_extract01_v32i16_sub_i16. The checks expect no (v)phsubw in any
; configuration, so fast and slow runs share the common check prefixes.
1541define i16 @extract_extract01_v32i16_sub_i16_commute(<32 x i16> %x) {
1542; SSE3-LABEL: extract_extract01_v32i16_sub_i16_commute:
1543; SSE3:       # %bb.0:
1544; SSE3-NEXT:    movd %xmm0, %ecx
1545; SSE3-NEXT:    pextrw $1, %xmm0, %eax
1546; SSE3-NEXT:    subl %ecx, %eax
1547; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax
1548; SSE3-NEXT:    retq
1549;
1550; AVX-LABEL: extract_extract01_v32i16_sub_i16_commute:
1551; AVX:       # %bb.0:
1552; AVX-NEXT:    vmovd %xmm0, %ecx
1553; AVX-NEXT:    vpextrw $1, %xmm0, %eax
1554; AVX-NEXT:    subl %ecx, %eax
1555; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
1556; AVX-NEXT:    vzeroupper
1557; AVX-NEXT:    retq
1558  %x0 = extractelement <32 x i16> %x, i32 0
1559  %x1 = extractelement <32 x i16> %x, i32 1
1560  %x01 = sub i16 %x1, %x0
1561  ret i16 %x01
1562}
1563
1564; Check output when 1 or both extracts have extra uses.
1565
; Sums elements 0 and 1 of a <4 x i32> vector where element 0 has a second
; use: it is also stored to %p. The checks expect fast-hops runs to still
; form (v)phaddd, with the store done by a (v)movd straight from xmm0 before
; the horizontal add overwrites the register.
1566define i32 @extract_extract01_v4i32_add_i32_uses1(<4 x i32> %x, i32* %p) {
1567; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses1:
1568; SSE3-SLOW:       # %bb.0:
1569; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1570; SSE3-SLOW-NEXT:    movd %xmm0, (%rdi)
1571; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1572; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1573; SSE3-SLOW-NEXT:    addl %ecx, %eax
1574; SSE3-SLOW-NEXT:    retq
1575;
1576; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_uses1:
1577; SSE3-FAST:       # %bb.0:
1578; SSE3-FAST-NEXT:    movd %xmm0, (%rdi)
1579; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
1580; SSE3-FAST-NEXT:    movd %xmm0, %eax
1581; SSE3-FAST-NEXT:    retq
1582;
1583; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses1:
1584; AVX-SLOW:       # %bb.0:
1585; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
1586; AVX-SLOW-NEXT:    vmovd %xmm0, (%rdi)
1587; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax
1588; AVX-SLOW-NEXT:    addl %ecx, %eax
1589; AVX-SLOW-NEXT:    retq
1590;
1591; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_uses1:
1592; AVX-FAST:       # %bb.0:
1593; AVX-FAST-NEXT:    vmovd %xmm0, (%rdi)
1594; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1595; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1596; AVX-FAST-NEXT:    retq
1597  %x0 = extractelement <4 x i32> %x, i32 0
1598  store i32 %x0, i32* %p
1599  %x1 = extractelement <4 x i32> %x, i32 1
1600  %x01 = add i32 %x0, %x1
1601  ret i32 %x01
1602}
1603
; Sums elements 0 and 1 of a <4 x i32> vector where element 1 has a second
; use: it is also stored to %p. The checks expect fast-hops runs to still
; form (v)phaddd; the stored lane is produced separately (pshufd + movd on
; SSE, vpextrd to memory on AVX) before the horizontal add.
1604define i32 @extract_extract01_v4i32_add_i32_uses2(<4 x i32> %x, i32* %p) {
1605; SSE3-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses2:
1606; SSE3-SLOW:       # %bb.0:
1607; SSE3-SLOW-NEXT:    movd %xmm0, %ecx
1608; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1609; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1610; SSE3-SLOW-NEXT:    addl %ecx, %eax
1611; SSE3-SLOW-NEXT:    movd %xmm0, (%rdi)
1612; SSE3-SLOW-NEXT:    retq
1613;
1614; SSE3-FAST-LABEL: extract_extract01_v4i32_add_i32_uses2:
1615; SSE3-FAST:       # %bb.0:
1616; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1617; SSE3-FAST-NEXT:    movd %xmm1, (%rdi)
1618; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
1619; SSE3-FAST-NEXT:    movd %xmm0, %eax
1620; SSE3-FAST-NEXT:    retq
1621;
1622; AVX-SLOW-LABEL: extract_extract01_v4i32_add_i32_uses2:
1623; AVX-SLOW:       # %bb.0:
1624; AVX-SLOW-NEXT:    vmovd %xmm0, %ecx
1625; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, %eax
1626; AVX-SLOW-NEXT:    addl %ecx, %eax
1627; AVX-SLOW-NEXT:    vpextrd $1, %xmm0, (%rdi)
1628; AVX-SLOW-NEXT:    retq
1629;
1630; AVX-FAST-LABEL: extract_extract01_v4i32_add_i32_uses2:
1631; AVX-FAST:       # %bb.0:
1632; AVX-FAST-NEXT:    vpextrd $1, %xmm0, (%rdi)
1633; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1634; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1635; AVX-FAST-NEXT:    retq
1636  %x0 = extractelement <4 x i32> %x, i32 0
1637  %x1 = extractelement <4 x i32> %x, i32 1
1638  store i32 %x1, i32* %p
1639  %x01 = add i32 %x0, %x1
1640  ret i32 %x01
1641}
1642
; Sums elements 0 and 1 of a <4 x i32> vector where both extracted elements
; have a second use: element 0 is stored to %p1 and element 1 to %p2. The
; checks expect no horizontal add in any configuration here, so fast and slow
; runs share the common check prefixes and all use a scalar addl.
1643define i32 @extract_extract01_v4i32_add_i32_uses3(<4 x i32> %x, i32* %p1, i32* %p2) {
1644; SSE3-LABEL: extract_extract01_v4i32_add_i32_uses3:
1645; SSE3:       # %bb.0:
1646; SSE3-NEXT:    movd %xmm0, %ecx
1647; SSE3-NEXT:    movd %xmm0, (%rdi)
1648; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
1649; SSE3-NEXT:    movd %xmm0, %eax
1650; SSE3-NEXT:    addl %ecx, %eax
1651; SSE3-NEXT:    movd %xmm0, (%rsi)
1652; SSE3-NEXT:    retq
1653;
1654; AVX-LABEL: extract_extract01_v4i32_add_i32_uses3:
1655; AVX:       # %bb.0:
1656; AVX-NEXT:    vmovd %xmm0, %ecx
1657; AVX-NEXT:    vmovd %xmm0, (%rdi)
1658; AVX-NEXT:    vpextrd $1, %xmm0, %eax
1659; AVX-NEXT:    addl %ecx, %eax
1660; AVX-NEXT:    vpextrd $1, %xmm0, (%rsi)
1661; AVX-NEXT:    retq
1662  %x0 = extractelement <4 x i32> %x, i32 0
1663  store i32 %x0, i32* %p1
1664  %x1 = extractelement <4 x i32> %x, i32 1
1665  store i32 %x1, i32* %p2
1666  %x01 = add i32 %x0, %x1
1667  ret i32 %x01
1668}
1669
1670; PR33758: https://bugs.llvm.org/show_bug.cgi?id=33758
1671
; Sums elements 0..3 of an <8 x i32> vector using a shuffle/add reduction
; written on the full 256-bit type: the first shuffle+add leaves x0+x2 in
; lane 0 and x1+x3 in lane 1, the second shuffle+add folds lane 1 into lane 0,
; and lane 0 is returned. All other shuffle-mask lanes are undef, so only the
; low 128 bits matter and every check block works on xmm registers. With
; fast-hops the AVX checks expect two vphaddd, while the SSE checks expect
; pshufd+paddd for the first step and a single phaddd for the second.
1672define i32 @partial_reduction_add_v8i32(<8 x i32> %x) {
1673; SSE3-SLOW-LABEL: partial_reduction_add_v8i32:
1674; SSE3-SLOW:       # %bb.0:
1675; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1676; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
1677; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1678; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
1679; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1680; SSE3-SLOW-NEXT:    retq
1681;
1682; SSE3-FAST-LABEL: partial_reduction_add_v8i32:
1683; SSE3-FAST:       # %bb.0:
1684; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1685; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1
1686; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1
1687; SSE3-FAST-NEXT:    movd %xmm1, %eax
1688; SSE3-FAST-NEXT:    retq
1689;
1690; AVX-SLOW-LABEL: partial_reduction_add_v8i32:
1691; AVX-SLOW:       # %bb.0:
1692; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1693; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1694; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1695; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1696; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
1697; AVX-SLOW-NEXT:    vzeroupper
1698; AVX-SLOW-NEXT:    retq
1699;
1700; AVX-FAST-LABEL: partial_reduction_add_v8i32:
1701; AVX-FAST:       # %bb.0:
1702; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1703; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1704; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1705; AVX-FAST-NEXT:    vzeroupper
1706; AVX-FAST-NEXT:    retq
1707  %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1708  %x0213 = add <8 x i32> %x, %x23
1709  %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1710  %x0123 = add <8 x i32> %x0213, %x13
1711  %r = extractelement <8 x i32> %x0123, i32 0
1712  ret i32 %r
1713}
1714
; 16 x i32 variant of the partial add reduction: elements 2,3 are shuffled
; onto 0,1 and added, element 1 of that result is shuffled onto 0 and added,
; and lane 0 (the sum of elements 0..3) is returned.
; The expected instruction sequences are the same as for the 8 x i32 variant.
1715define i32 @partial_reduction_add_v16i32(<16 x i32> %x) {
1716; SSE3-SLOW-LABEL: partial_reduction_add_v16i32:
1717; SSE3-SLOW:       # %bb.0:
1718; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1719; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
1720; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1721; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
1722; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1723; SSE3-SLOW-NEXT:    retq
1724;
1725; SSE3-FAST-LABEL: partial_reduction_add_v16i32:
1726; SSE3-FAST:       # %bb.0:
1727; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1728; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1
1729; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1
1730; SSE3-FAST-NEXT:    movd %xmm1, %eax
1731; SSE3-FAST-NEXT:    retq
1732;
1733; AVX-SLOW-LABEL: partial_reduction_add_v16i32:
1734; AVX-SLOW:       # %bb.0:
1735; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1736; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1737; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1738; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1739; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
1740; AVX-SLOW-NEXT:    vzeroupper
1741; AVX-SLOW-NEXT:    retq
1742;
1743; AVX-FAST-LABEL: partial_reduction_add_v16i32:
1744; AVX-FAST:       # %bb.0:
1745; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1746; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
1747; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1748; AVX-FAST-NEXT:    vzeroupper
1749; AVX-FAST-NEXT:    retq
1750  %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1751  %x0213 = add <16 x i32> %x, %x23
1752  %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1753  %x0123 = add <16 x i32> %x0213, %x13
1754  %r = extractelement <16 x i32> %x0123, i32 0
1755  ret i32 %r
1756}
1757
; Subtraction counterpart of the partial add reduction on 8 x i32. Lane 0 of
; the result is (x[0] - x[2]) - (x[1] - x[3]).
; With fast-hops only the final step is expected as a horizontal subtract
; ((v)phsubd); the first step stays a shuffle plus (v)psubd on every target.
1758define i32 @partial_reduction_sub_v8i32(<8 x i32> %x) {
1759; SSE3-SLOW-LABEL: partial_reduction_sub_v8i32:
1760; SSE3-SLOW:       # %bb.0:
1761; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1762; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
1763; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1764; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
1765; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1766; SSE3-SLOW-NEXT:    retq
1767;
1768; SSE3-FAST-LABEL: partial_reduction_sub_v8i32:
1769; SSE3-FAST:       # %bb.0:
1770; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1771; SSE3-FAST-NEXT:    psubd %xmm1, %xmm0
1772; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
1773; SSE3-FAST-NEXT:    movd %xmm0, %eax
1774; SSE3-FAST-NEXT:    retq
1775;
1776; AVX-SLOW-LABEL: partial_reduction_sub_v8i32:
1777; AVX-SLOW:       # %bb.0:
1778; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1779; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1780; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1781; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1782; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
1783; AVX-SLOW-NEXT:    vzeroupper
1784; AVX-SLOW-NEXT:    retq
1785;
1786; AVX-FAST-LABEL: partial_reduction_sub_v8i32:
1787; AVX-FAST:       # %bb.0:
1788; AVX-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1789; AVX-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1790; AVX-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
1791; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1792; AVX-FAST-NEXT:    vzeroupper
1793; AVX-FAST-NEXT:    retq
1794  %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1795  %x0213 = sub <8 x i32> %x, %x23
1796  %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1797  %x0123 = sub <8 x i32> %x0213, %x13
1798  %r = extractelement <8 x i32> %x0123, i32 0
1799  ret i32 %r
1800}
1801
; 16 x i32 variant of the partial subtract reduction. Lane 0 of the result is
; (x[0] - x[2]) - (x[1] - x[3]).
; The fast-hops expectations differ per AVX level, hence the separate
; AVX1/AVX2/AVX512 fast prefixes: the AVX1 run expects the final step as
; vphsubd, while the AVX2 and AVX512 runs expect the same shuffle + vpsubd
; sequence as the slow configurations.
1802define i32 @partial_reduction_sub_v16i32(<16 x i32> %x) {
1803; SSE3-SLOW-LABEL: partial_reduction_sub_v16i32:
1804; SSE3-SLOW:       # %bb.0:
1805; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1806; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
1807; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1808; SSE3-SLOW-NEXT:    psubd %xmm1, %xmm0
1809; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1810; SSE3-SLOW-NEXT:    retq
1811;
1812; SSE3-FAST-LABEL: partial_reduction_sub_v16i32:
1813; SSE3-FAST:       # %bb.0:
1814; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1815; SSE3-FAST-NEXT:    psubd %xmm1, %xmm0
1816; SSE3-FAST-NEXT:    phsubd %xmm0, %xmm0
1817; SSE3-FAST-NEXT:    movd %xmm0, %eax
1818; SSE3-FAST-NEXT:    retq
1819;
1820; AVX-SLOW-LABEL: partial_reduction_sub_v16i32:
1821; AVX-SLOW:       # %bb.0:
1822; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1823; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1824; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1825; AVX-SLOW-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1826; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
1827; AVX-SLOW-NEXT:    vzeroupper
1828; AVX-SLOW-NEXT:    retq
1829;
1830; AVX1-FAST-LABEL: partial_reduction_sub_v16i32:
1831; AVX1-FAST:       # %bb.0:
1832; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1833; AVX1-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1834; AVX1-FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
1835; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
1836; AVX1-FAST-NEXT:    vzeroupper
1837; AVX1-FAST-NEXT:    retq
1838;
1839; AVX2-FAST-LABEL: partial_reduction_sub_v16i32:
1840; AVX2-FAST:       # %bb.0:
1841; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1842; AVX2-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1843; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1844; AVX2-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1845; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
1846; AVX2-FAST-NEXT:    vzeroupper
1847; AVX2-FAST-NEXT:    retq
1848;
1849; AVX512-FAST-LABEL: partial_reduction_sub_v16i32:
1850; AVX512-FAST:       # %bb.0:
1851; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1852; AVX512-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1853; AVX512-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1854; AVX512-FAST-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
1855; AVX512-FAST-NEXT:    vmovd %xmm0, %eax
1856; AVX512-FAST-NEXT:    vzeroupper
1857; AVX512-FAST-NEXT:    retq
1858  %x23 = shufflevector <16 x i32> %x, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1859  %x0213 = sub <16 x i32> %x, %x23
1860  %x13 = shufflevector <16 x i32> %x0213, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1861  %x0123 = sub <16 x i32> %x0213, %x13
1862  %r = extractelement <16 x i32> %x0123, i32 0
1863  ret i32 %r
1864}
1865
1866; https://bugs.chromium.org/p/chromium/issues/detail?id=1195353
; Negative test (per its name): moves the upper two i64 elements of %a0 down
; to positions 0,1, adds that to %a0 as 16 x i16, and returns the low two i64
; elements of the sum. Every configuration, including fast-hops, is expected
; to emit a plain (v)paddw and no horizontal add. The AVX expectations are
; listed per sub-target because the AVX1 runs extract the upper half with
; vextractf128 whereas the AVX2 and AVX512 runs use vextracti128.
1867define <2 x i64> @negative_extract_v16i16_v8i16(<4 x i64> %a0) {
1868; SSE3-LABEL: negative_extract_v16i16_v8i16:
1869; SSE3:       # %bb.0:
1870; SSE3-NEXT:    paddw %xmm1, %xmm0
1871; SSE3-NEXT:    retq
1872;
1873; AVX1-SLOW-LABEL: negative_extract_v16i16_v8i16:
1874; AVX1-SLOW:       # %bb.0:
1875; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
1876; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1877; AVX1-SLOW-NEXT:    vzeroupper
1878; AVX1-SLOW-NEXT:    retq
1879;
1880; AVX1-FAST-LABEL: negative_extract_v16i16_v8i16:
1881; AVX1-FAST:       # %bb.0:
1882; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
1883; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1884; AVX1-FAST-NEXT:    vzeroupper
1885; AVX1-FAST-NEXT:    retq
1886;
1887; AVX2-SLOW-LABEL: negative_extract_v16i16_v8i16:
1888; AVX2-SLOW:       # %bb.0:
1889; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1890; AVX2-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1891; AVX2-SLOW-NEXT:    vzeroupper
1892; AVX2-SLOW-NEXT:    retq
1893;
1894; AVX2-FAST-LABEL: negative_extract_v16i16_v8i16:
1895; AVX2-FAST:       # %bb.0:
1896; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
1897; AVX2-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1898; AVX2-FAST-NEXT:    vzeroupper
1899; AVX2-FAST-NEXT:    retq
1900;
1901; AVX512-SLOW-LABEL: negative_extract_v16i16_v8i16:
1902; AVX512-SLOW:       # %bb.0:
1903; AVX512-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
1904; AVX512-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1905; AVX512-SLOW-NEXT:    vzeroupper
1906; AVX512-SLOW-NEXT:    retq
1907;
1908; AVX512-FAST-LABEL: negative_extract_v16i16_v8i16:
1909; AVX512-FAST:       # %bb.0:
1910; AVX512-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
1911; AVX512-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1912; AVX512-FAST-NEXT:    vzeroupper
1913; AVX512-FAST-NEXT:    retq
1914  %s = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
1915  %b = bitcast <4 x i64> %a0 to <16 x i16>
1916  %c = bitcast <4 x i64> %s to <16 x i16>
1917  %d = add <16 x i16> %b, %c
1918  %e = bitcast <16 x i16> %d to <4 x i64>
1919  %f = shufflevector <4 x i64> %e, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
1920  ret <2 x i64> %f
1921}
1922
1923; PR42023 - https://bugs.llvm.org/show_bug.cgi?id=42023
1924
; Full add reduction of an 8 x i16 vector in three shuffle + add steps
; (elements 4..7 onto 0..3, then 2,3 onto 0,1, then 1 onto 0); lane 0 of the
; last sum is returned. The fast-hops runs expect three (v)phaddw
; instructions; the slow runs expect shuffles/shift with (v)paddw instead.
1925define i16 @hadd16_8(<8 x i16> %x223) {
1926; SSE3-SLOW-LABEL: hadd16_8:
1927; SSE3-SLOW:       # %bb.0:
1928; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1929; SSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
1930; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1931; SSE3-SLOW-NEXT:    paddw %xmm1, %xmm0
1932; SSE3-SLOW-NEXT:    movdqa %xmm0, %xmm1
1933; SSE3-SLOW-NEXT:    psrld $16, %xmm1
1934; SSE3-SLOW-NEXT:    paddw %xmm0, %xmm1
1935; SSE3-SLOW-NEXT:    movd %xmm1, %eax
1936; SSE3-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1937; SSE3-SLOW-NEXT:    retq
1938;
1939; SSE3-FAST-LABEL: hadd16_8:
1940; SSE3-FAST:       # %bb.0:
1941; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
1942; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
1943; SSE3-FAST-NEXT:    phaddw %xmm0, %xmm0
1944; SSE3-FAST-NEXT:    movd %xmm0, %eax
1945; SSE3-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1946; SSE3-FAST-NEXT:    retq
1947;
1948; AVX-SLOW-LABEL: hadd16_8:
1949; AVX-SLOW:       # %bb.0:
1950; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1951; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1952; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1953; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1954; AVX-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
1955; AVX-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
1956; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
1957; AVX-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
1958; AVX-SLOW-NEXT:    retq
1959;
1960; AVX-FAST-LABEL: hadd16_8:
1961; AVX-FAST:       # %bb.0:
1962; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
1963; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
1964; AVX-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
1965; AVX-FAST-NEXT:    vmovd %xmm0, %eax
1966; AVX-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
1967; AVX-FAST-NEXT:    retq
1968  %x224 = shufflevector <8 x i16> %x223, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
1969  %x225 = add <8 x i16> %x223, %x224
1970  %x226 = shufflevector <8 x i16> %x225, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1971  %x227 = add <8 x i16> %x225, %x226
1972  %x228 = shufflevector <8 x i16> %x227, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1973  %x229 = add <8 x i16> %x227, %x228
1974  %x230 = extractelement <8 x i16> %x229, i32 0
1975  ret i16 %x230
1976}
1977
; Full add reduction of a 4 x i32 vector in two shuffle + add steps
; (elements 2,3 onto 0,1, then 1 onto 0); lane 0 of the last sum is returned.
; The fast-hops runs expect two (v)phaddd instructions; the slow runs expect
; the shuffle + (v)paddd form.
1978define i32 @hadd32_4(<4 x i32> %x225) {
1979; SSE3-SLOW-LABEL: hadd32_4:
1980; SSE3-SLOW:       # %bb.0:
1981; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1982; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
1983; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
1984; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
1985; SSE3-SLOW-NEXT:    movd %xmm0, %eax
1986; SSE3-SLOW-NEXT:    retq
1987;
1988; SSE3-FAST-LABEL: hadd32_4:
1989; SSE3-FAST:       # %bb.0:
1990; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
1991; SSE3-FAST-NEXT:    phaddd %xmm0, %xmm0
1992; SSE3-FAST-NEXT:    movd %xmm0, %eax
1993; SSE3-FAST-NEXT:    retq
1994;
1995; AVX-SLOW-LABEL: hadd32_4:
1996; AVX-SLOW:       # %bb.0:
1997; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
1998; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
1999; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2000; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2001; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
2002; AVX-SLOW-NEXT:    retq
2003;
2004; AVX-FAST-LABEL: hadd32_4:
2005; AVX-FAST:       # %bb.0:
2006; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2007; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2008; AVX-FAST-NEXT:    vmovd %xmm0, %eax
2009; AVX-FAST-NEXT:    retq
2010  %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
2011  %x227 = add <4 x i32> %x225, %x226
2012  %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
2013  %x229 = add <4 x i32> %x227, %x228
2014  %x230 = extractelement <4 x i32> %x229, i32 0
2015  ret i32 %x230
2016}
2017
; Same two-step shuffle + add pattern applied to an 8 x i32 vector; only
; elements 0..3 contribute to the returned lane 0.
; The fast-hops AVX runs expect two vphaddd instructions, while the fast-hops
; SSE3 run expects a shuffle + paddd followed by a single phaddd.
2018define i32 @hadd32_8(<8 x i32> %x225) {
2019; SSE3-SLOW-LABEL: hadd32_8:
2020; SSE3-SLOW:       # %bb.0:
2021; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2022; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
2023; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
2024; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
2025; SSE3-SLOW-NEXT:    movd %xmm0, %eax
2026; SSE3-SLOW-NEXT:    retq
2027;
2028; SSE3-FAST-LABEL: hadd32_8:
2029; SSE3-FAST:       # %bb.0:
2030; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2031; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1
2032; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1
2033; SSE3-FAST-NEXT:    movd %xmm1, %eax
2034; SSE3-FAST-NEXT:    retq
2035;
2036; AVX-SLOW-LABEL: hadd32_8:
2037; AVX-SLOW:       # %bb.0:
2038; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2039; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2040; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2041; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2042; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
2043; AVX-SLOW-NEXT:    vzeroupper
2044; AVX-SLOW-NEXT:    retq
2045;
2046; AVX-FAST-LABEL: hadd32_8:
2047; AVX-FAST:       # %bb.0:
2048; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2049; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2050; AVX-FAST-NEXT:    vmovd %xmm0, %eax
2051; AVX-FAST-NEXT:    vzeroupper
2052; AVX-FAST-NEXT:    retq
2053  %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2054  %x227 = add <8 x i32> %x225, %x226
2055  %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2056  %x229 = add <8 x i32> %x227, %x228
2057  %x230 = extractelement <8 x i32> %x229, i32 0
2058  ret i32 %x230
2059}
2060
; Same two-step shuffle + add pattern applied to a 16 x i32 vector; only
; elements 0..3 contribute to the returned lane 0.
; The expected instruction sequences are the same as for the 8 x i32 variant.
2061define i32 @hadd32_16(<16 x i32> %x225) {
2062; SSE3-SLOW-LABEL: hadd32_16:
2063; SSE3-SLOW:       # %bb.0:
2064; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2065; SSE3-SLOW-NEXT:    paddd %xmm0, %xmm1
2066; SSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
2067; SSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
2068; SSE3-SLOW-NEXT:    movd %xmm0, %eax
2069; SSE3-SLOW-NEXT:    retq
2070;
2071; SSE3-FAST-LABEL: hadd32_16:
2072; SSE3-FAST:       # %bb.0:
2073; SSE3-FAST-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2074; SSE3-FAST-NEXT:    paddd %xmm0, %xmm1
2075; SSE3-FAST-NEXT:    phaddd %xmm1, %xmm1
2076; SSE3-FAST-NEXT:    movd %xmm1, %eax
2077; SSE3-FAST-NEXT:    retq
2078;
2079; AVX-SLOW-LABEL: hadd32_16:
2080; AVX-SLOW:       # %bb.0:
2081; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2082; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2083; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2084; AVX-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
2085; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
2086; AVX-SLOW-NEXT:    vzeroupper
2087; AVX-SLOW-NEXT:    retq
2088;
2089; AVX-FAST-LABEL: hadd32_16:
2090; AVX-FAST:       # %bb.0:
2091; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2092; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2093; AVX-FAST-NEXT:    vmovd %xmm0, %eax
2094; AVX-FAST-NEXT:    vzeroupper
2095; AVX-FAST-NEXT:    retq
2096  %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2097  %x227 = add <16 x i32> %x225, %x226
2098  %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2099  %x229 = add <16 x i32> %x227, %x228
2100  %x230 = extractelement <16 x i32> %x229, i32 0
2101  ret i32 %x230
2102}
2103
; Same IR body as hadd16_8 but with the optsize attribute. Only the shared
; SSE3 and AVX check prefixes are used, so both the slow and fast-hops
; configurations are expected to emit three (v)phaddw instructions.
2104define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize {
2105; SSE3-LABEL: hadd16_8_optsize:
2106; SSE3:       # %bb.0:
2107; SSE3-NEXT:    phaddw %xmm0, %xmm0
2108; SSE3-NEXT:    phaddw %xmm0, %xmm0
2109; SSE3-NEXT:    phaddw %xmm0, %xmm0
2110; SSE3-NEXT:    movd %xmm0, %eax
2111; SSE3-NEXT:    # kill: def $ax killed $ax killed $eax
2112; SSE3-NEXT:    retq
2113;
2114; AVX-LABEL: hadd16_8_optsize:
2115; AVX:       # %bb.0:
2116; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
2117; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
2118; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
2119; AVX-NEXT:    vmovd %xmm0, %eax
2120; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
2121; AVX-NEXT:    retq
2122  %x224 = shufflevector <8 x i16> %x223, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
2123  %x225 = add <8 x i16> %x223, %x224
2124  %x226 = shufflevector <8 x i16> %x225, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2125  %x227 = add <8 x i16> %x225, %x226
2126  %x228 = shufflevector <8 x i16> %x227, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2127  %x229 = add <8 x i16> %x227, %x228
2128  %x230 = extractelement <8 x i16> %x229, i32 0
2129  ret i16 %x230
2130}
2131
; Same IR body as hadd32_4 but with the optsize attribute. Only the shared
; SSE3 and AVX check prefixes are used, so both the slow and fast-hops
; configurations are expected to emit two (v)phaddd instructions.
2132define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize {
2133; SSE3-LABEL: hadd32_4_optsize:
2134; SSE3:       # %bb.0:
2135; SSE3-NEXT:    phaddd %xmm0, %xmm0
2136; SSE3-NEXT:    phaddd %xmm0, %xmm0
2137; SSE3-NEXT:    movd %xmm0, %eax
2138; SSE3-NEXT:    retq
2139;
2140; AVX-LABEL: hadd32_4_optsize:
2141; AVX:       # %bb.0:
2142; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2143; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2144; AVX-NEXT:    vmovd %xmm0, %eax
2145; AVX-NEXT:    retq
2146  %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
2147  %x227 = add <4 x i32> %x225, %x226
2148  %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
2149  %x229 = add <4 x i32> %x227, %x228
2150  %x230 = extractelement <4 x i32> %x229, i32 0
2151  ret i32 %x230
2152}
2153
; Same IR body as hadd32_4, but instead of optsize the function carries
; !prof !14, which the metadata at the end of the file defines as a
; function_entry_count of 0. The expected output matches the optsize variant
; (two (v)phaddd instructions in every configuration).
; NOTE(review): "pgso" presumably stands for profile-guided size
; optimization - confirm.
2154define i32 @hadd32_4_pgso(<4 x i32> %x225) !prof !14 {
2155; SSE3-LABEL: hadd32_4_pgso:
2156; SSE3:       # %bb.0:
2157; SSE3-NEXT:    phaddd %xmm0, %xmm0
2158; SSE3-NEXT:    phaddd %xmm0, %xmm0
2159; SSE3-NEXT:    movd %xmm0, %eax
2160; SSE3-NEXT:    retq
2161;
2162; AVX-LABEL: hadd32_4_pgso:
2163; AVX:       # %bb.0:
2164; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2165; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2166; AVX-NEXT:    vmovd %xmm0, %eax
2167; AVX-NEXT:    retq
2168  %x226 = shufflevector <4 x i32> %x225, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
2169  %x227 = add <4 x i32> %x225, %x226
2170  %x228 = shufflevector <4 x i32> %x227, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
2171  %x229 = add <4 x i32> %x227, %x228
2172  %x230 = extractelement <4 x i32> %x229, i32 0
2173  ret i32 %x230
2174}
2175
; Same IR body as hadd32_8 but with the optsize attribute. Only the shared
; SSE3 and AVX check prefixes are used, so slow and fast-hops configurations
; are expected to match: the SSE3 runs emit a shuffle + paddd followed by one
; phaddd, and the AVX runs emit two vphaddd instructions.
2176define i32 @hadd32_8_optsize(<8 x i32> %x225) optsize {
2177; SSE3-LABEL: hadd32_8_optsize:
2178; SSE3:       # %bb.0:
2179; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2180; SSE3-NEXT:    paddd %xmm0, %xmm1
2181; SSE3-NEXT:    phaddd %xmm1, %xmm1
2182; SSE3-NEXT:    movd %xmm1, %eax
2183; SSE3-NEXT:    retq
2184;
2185; AVX-LABEL: hadd32_8_optsize:
2186; AVX:       # %bb.0:
2187; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2188; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2189; AVX-NEXT:    vmovd %xmm0, %eax
2190; AVX-NEXT:    vzeroupper
2191; AVX-NEXT:    retq
2192  %x226 = shufflevector <8 x i32> %x225, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2193  %x227 = add <8 x i32> %x225, %x226
2194  %x228 = shufflevector <8 x i32> %x227, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2195  %x229 = add <8 x i32> %x227, %x228
2196  %x230 = extractelement <8 x i32> %x229, i32 0
2197  ret i32 %x230
2198}
2199
; Same IR body as hadd32_16 but with the optsize attribute. Only the shared
; SSE3 and AVX check prefixes are used, so slow and fast-hops configurations
; are expected to match: the SSE3 runs emit a shuffle + paddd followed by one
; phaddd, and the AVX runs emit two vphaddd instructions.
2200define i32 @hadd32_16_optsize(<16 x i32> %x225) optsize {
2201; SSE3-LABEL: hadd32_16_optsize:
2202; SSE3:       # %bb.0:
2203; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2204; SSE3-NEXT:    paddd %xmm0, %xmm1
2205; SSE3-NEXT:    phaddd %xmm1, %xmm1
2206; SSE3-NEXT:    movd %xmm1, %eax
2207; SSE3-NEXT:    retq
2208;
2209; AVX-LABEL: hadd32_16_optsize:
2210; AVX:       # %bb.0:
2211; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2212; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
2213; AVX-NEXT:    vmovd %xmm0, %eax
2214; AVX-NEXT:    vzeroupper
2215; AVX-NEXT:    retq
2216  %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2217  %x227 = add <16 x i32> %x225, %x226
2218  %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
2219  %x229 = add <16 x i32> %x227, %x228
2220  %x230 = extractelement <16 x i32> %x229, i32 0
2221  ret i32 %x230
2222}
2223
; Module-level profile data. !0 installs a "ProfileSummary" module flag whose
; fields are listed in !1 (!2..!9). !14 is the function_entry_count of 0 that
; hadd32_4_pgso references through its !prof attachment.
2224!llvm.module.flags = !{!0}
2225!0 = !{i32 1, !"ProfileSummary", !1}
2226!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
2227!2 = !{!"ProfileFormat", !"InstrProf"}
2228!3 = !{!"TotalCount", i64 10000}
2229!4 = !{!"MaxCount", i64 10}
2230!5 = !{!"MaxInternalCount", i64 1}
2231!6 = !{!"MaxFunctionCount", i64 1000}
2232!7 = !{!"NumCounts", i64 3}
2233!8 = !{!"NumFunctions", i64 3}
2234!9 = !{!"DetailedSummary", !10}
2235!10 = !{!11, !12, !13}
2236!11 = !{i32 10000, i64 100, i32 1}
2237!12 = !{i32 999000, i64 100, i32 1}
2238!13 = !{i32 999999, i64 1, i32 2}
2239!14 = !{!"function_entry_count", i64 0}
2240