; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s
; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s
; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s

target triple = "x86_64-unknown-unknown"

; Ensure that the backend no longer emits unnecessary vector insert
; instructions immediately after SSE scalar fp instructions
; like addss or mulss.
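; For example, for test_add_ss below a naive lowering could produce a
; sequence along the lines of (a sketch of the redundant pattern, not
; output that is checked here):
;   movaps %xmm0, %xmm2
;   addss  %xmm1, %xmm2
;   movss  %xmm2, %xmm0   # unnecessary re-insert of the scalar result
; whereas the checks below expect just the single scalar instruction.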

define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_add_ss:
; SSE:       # BB#0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_add_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %3 = insertelement <4 x float> %a, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_sub_ss:
; SSE:       # BB#0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_sub_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %a, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_mul_ss:
; SSE:       # BB#0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_mul_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %3 = insertelement <4 x float> %a, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_div_ss:
; SSE:       # BB#0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %a, float %div, i32 0
  ret <4 x float> %3
}

define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_add_sd:
; SSE:       # BB#0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_add_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %add = fadd double %2, %1
  %3 = insertelement <2 x double> %a, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_sub_sd:
; SSE:       # BB#0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_sub_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %a, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_mul_sd:
; SSE:       # BB#0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_mul_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %mul = fmul double %2, %1
  %3 = insertelement <2 x double> %a, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_div_sd:
; SSE:       # BB#0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %a, double %div, i32 0
  ret <2 x double> %3
}

define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_add_ss:
; SSE:       # BB#0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_add_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %add = fadd float %1, %2
  %3 = insertelement <4 x float> %b, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_sub_ss:
; SSE:       # BB#0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_sub_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %b, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_mul_ss:
; SSE:       # BB#0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_mul_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %mul = fmul float %1, %2
  %3 = insertelement <4 x float> %b, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_div_ss:
; SSE:       # BB#0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_div_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %b, float %div, i32 0
  ret <4 x float> %3
}

define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_add_sd:
; SSE:       # BB#0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_add_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %add = fadd double %1, %2
  %3 = insertelement <2 x double> %b, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_sub_sd:
; SSE:       # BB#0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_sub_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %b, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_mul_sd:
; SSE:       # BB#0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_mul_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %mul = fmul double %1, %2
  %3 = insertelement <2 x double> %b, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_div_sd:
; SSE:       # BB#0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_div_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %b, double %div, i32 0
  ret <2 x double> %3
}

define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_add_ss:
; SSE:       # BB#0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_multiple_add_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %add2 = fadd float %2, %add
  %3 = insertelement <4 x float> %a, float %add2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_sub_ss:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    subss %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_multiple_sub_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %sub2 = fsub float %2, %sub
  %3 = insertelement <4 x float> %a, float %sub2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_mul_ss:
; SSE:       # BB#0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_multiple_mul_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %mul2 = fmul float %2, %mul
  %3 = insertelement <4 x float> %a, float %mul2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_div_ss:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    divss %xmm1, %xmm2
; SSE-NEXT:    divss %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_multiple_div_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %div2 = fdiv float %2, %div
  %3 = insertelement <4 x float> %a, float %div2, i32 0
  ret <4 x float> %3
}

; Ensure that the backend selects SSE/AVX scalar fp instructions
; from a packed fp instruction plus a vector insert.
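; For example, in insert_test_add_ss below the IR pair
;   %1 = fadd <4 x float> %a, %b
;   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
; only keeps lane 0 of the packed result (and lanes 1-3 of %a), so it can be
; selected as a single addss/vaddss. The insert_test3/insert_test4 variants
; below express the same pattern with a select on a constant <i1> mask.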

define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_add_ss:
; SSE:       # BB#0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test_add_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fadd <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_sub_ss:
; SSE:       # BB#0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test_sub_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fsub <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_mul_ss:
; SSE:       # BB#0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test_mul_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fmul <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_div_ss:
; SSE:       # BB#0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test_div_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fdiv <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_add_sd:
; SSE:       # BB#0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test_add_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fadd <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_sub_sd:
; SSE:       # BB#0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test_sub_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fsub <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_mul_sd:
; SSE:       # BB#0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test_mul_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fmul <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_div_sd:
; SSE:       # BB#0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test_div_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fdiv <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_add_ss:
; SSE:       # BB#0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test2_add_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fadd <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_sub_ss:
; SSE:       # BB#0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test2_sub_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fsub <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_mul_ss:
; SSE:       # BB#0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test2_mul_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fmul <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_div_ss:
; SSE:       # BB#0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test2_div_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fdiv <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_add_sd:
; SSE:       # BB#0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test2_add_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fadd <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_sub_sd:
; SSE:       # BB#0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test2_sub_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fsub <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_mul_sd:
; SSE:       # BB#0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test2_mul_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fmul <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_div_sd:
; SSE:       # BB#0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test2_div_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fdiv <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_add_ss:
; SSE:       # BB#0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test3_add_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fadd <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_sub_ss:
; SSE:       # BB#0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test3_sub_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fsub <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_mul_ss:
; SSE:       # BB#0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test3_mul_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fmul <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_div_ss:
; SSE:       # BB#0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test3_div_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fdiv <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_add_sd:
; SSE:       # BB#0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test3_add_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fadd <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_sub_sd:
; SSE:       # BB#0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test3_sub_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fsub <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_mul_sd:
; SSE:       # BB#0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test3_mul_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fmul <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_div_sd:
; SSE:       # BB#0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test3_div_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = fdiv <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_add_ss:
; SSE:       # BB#0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test4_add_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fadd <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_sub_ss:
; SSE:       # BB#0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test4_sub_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fsub <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_mul_ss:
; SSE:       # BB#0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test4_mul_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fmul <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_div_ss:
; SSE:       # BB#0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test4_div_ss:
; AVX:       # BB#0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fdiv <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_add_sd:
; SSE:       # BB#0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test4_add_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fadd <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_sub_sd:
; SSE:       # BB#0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test4_sub_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fsub <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_mul_sd:
; SSE:       # BB#0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test4_mul_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fmul <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_div_sd:
; SSE:       # BB#0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_test4_div_sd:
; AVX:       # BB#0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = fdiv <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}