; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE2,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE,SSE41,X86-SSE41
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X86-AVX,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE2,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE,SSE41,X64-SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,X64-AVX,X64-AVX512

; Ensure that the backend no longer emits unnecessary vector insert
; instructions immediately after SSE scalar fp instructions
; like addss or mulss.
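;
; For example, the IR idiom below (scalar extracts feeding an fadd whose
; result is inserted back into lane 0) should collapse into a single
; (v)addss with no trailing insert/move; the tests that follow exercise
; this pattern for each scalar fp operation:
;
;   %1 = extractelement <4 x float> %b, i32 0
;   %2 = extractelement <4 x float> %a, i32 0
;   %add = fadd float %2, %1
;   %3 = insertelement <4 x float> %a, float %add, i32 0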

define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %3 = insertelement <4 x float> %a, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %a, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %3 = insertelement <4 x float> %a, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %a, float %div, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_sqrt_ss(<4 x float> %a) {
; SSE-LABEL: test_sqrt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sqrt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = call float @llvm.sqrt.f32(float %1)
  %3 = insertelement <4 x float> %a, float %2, i32 0
  ret <4 x float> %3
}
declare float @llvm.sqrt.f32(float)

define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %add = fadd double %2, %1
  %3 = insertelement <2 x double> %a, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %a, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %mul = fmul double %2, %1
  %3 = insertelement <2 x double> %a, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %a, double %div, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_sqrt_sd(<2 x double> %a) {
; SSE-LABEL: test_sqrt_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sqrt_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = call double @llvm.sqrt.f64(double %1)
  %3 = insertelement <2 x double> %a, double %2, i32 0
  ret <2 x double> %3
}
declare double @llvm.sqrt.f64(double)

define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %add = fadd float %1, %2
  %3 = insertelement <4 x float> %b, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %b, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %mul = fmul float %1, %2
  %3 = insertelement <4 x float> %b, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %b, float %div, i32 0
  ret <4 x float> %3
}

define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %add = fadd double %1, %2
  %3 = insertelement <2 x double> %b, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %b, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %mul = fmul double %1, %2
  %3 = insertelement <2 x double> %b, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %b, double %div, i32 0
  ret <2 x double> %3
}

define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %add2 = fadd float %2, %add
  %3 = insertelement <4 x float> %a, float %add2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    subss %xmm2, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %sub2 = fsub float %2, %sub
  %3 = insertelement <4 x float> %a, float %sub2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %mul2 = fmul float %2, %mul
  %3 = insertelement <4 x float> %a, float %mul2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    divss %xmm1, %xmm2
; SSE-NEXT:    divss %xmm2, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %div2 = fdiv float %2, %div
  %3 = insertelement <4 x float> %a, float %div2, i32 0
  ret <4 x float> %3
}

; With SSE4.1 or greater, the shuffles in the following tests may
; be lowered to X86Blendi nodes.
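;
; Each blend test builds lane 0 in a fresh vector and reselects lanes 1-3
; from %a, e.g. with the mask <i32 0, i32 5, i32 6, i32 7>:
;
;   %ins = insertelement <4 x float> undef, float %op, i32 0
;   %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
;
; Whether or not the shuffle becomes a blend node, the whole sequence
; should still fold into a single scalar fp instruction.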

define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_add_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_add_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_add_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_add_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fadd float %b, %ext
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_sub_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_sub_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_sub_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    subss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_sub_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fsub float %ext, %b
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_mul_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    mulss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_mul_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_mul_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    mulss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_mul_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fmul float %b, %ext
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_div_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    divss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_div_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vdivss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_div_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    divss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_div_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fdiv float %ext, %b
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_add_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    addsd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_add_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vaddsd {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_add_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_add_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fadd double %b, %ext
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_sub_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    subsd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_sub_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vsubsd {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_sub_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    subsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_sub_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fsub double %ext, %b
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_mul_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    mulsd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_mul_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmulsd {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_mul_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    mulsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_mul_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fmul double %b, %ext
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_div_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    divsd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_div_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vdivsd {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_div_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    divsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_div_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fdiv double %ext, %b
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

; Ensure that the backend selects SSE/AVX scalar fp instructions
; from a packed fp instruction plus a vector insert.
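;
; That is, a packed op whose upper lanes are immediately reselected from
; one of the sources should shrink to a single scalar instruction:
;
;   %1 = fadd <4 x float> %a, %b
;   %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>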

define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <4 x float> @insert_test5_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test5_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test5_sub_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_sub_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    subps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test5_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test5_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test5_div_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    divps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_div_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    divps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test5_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test5_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test5_sub_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subpd %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_sub_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    subpd %xmm0, %xmm1
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test5_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test5_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test5_div_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    divpd %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_div_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    divpd %xmm0, %xmm1
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

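; Verify masked scalar fp ops: the i8 mask is bitcast to <8 x i1> and bit 0
; selects between the scalar sum and the passthru element from %c. With
; AVX512 this should lower to a single masked (v)addss/(v)addsd; earlier
; subtargets are expected to fall back to a test-and-branch plus a
; movss/blend, as the checks below show.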
define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; X86-SSE2-LABEL: add_ss_mask:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    jne .LBB70_1
; X86-SSE2-NEXT:  # %bb.2:
; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-SSE2-NEXT:    retl
; X86-SSE2-NEXT:  .LBB70_1:
; X86-SSE2-NEXT:    addss %xmm0, %xmm1
; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE2-NEXT:    retl
;
; X86-SSE41-LABEL: add_ss_mask:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE41-NEXT:    jne .LBB70_1
; X86-SSE41-NEXT:  # %bb.2:
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-SSE41-NEXT:    retl
; X86-SSE41-NEXT:  .LBB70_1:
; X86-SSE41-NEXT:    addss %xmm0, %xmm1
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE41-NEXT:    retl
;
; X86-AVX1-LABEL: add_ss_mask:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    je .LBB70_2
; X86-AVX1-NEXT:  # %bb.1:
; X86-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT:  .LBB70_2:
; X86-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: add_ss_mask:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-AVX512-NEXT:    kmovw %eax, %k1
; X86-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; X86-AVX512-NEXT:    vmovaps %xmm2, %xmm0
; X86-AVX512-NEXT:    retl
;
; X64-SSE2-LABEL: add_ss_mask:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    testb $1, %dil
; X64-SSE2-NEXT:    jne .LBB70_1
; X64-SSE2-NEXT:  # %bb.2:
; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-SSE2-NEXT:    retq
; X64-SSE2-NEXT:  .LBB70_1:
; X64-SSE2-NEXT:    addss %xmm0, %xmm1
; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE2-NEXT:    retq
;
; X64-SSE41-LABEL: add_ss_mask:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    testb $1, %dil
; X64-SSE41-NEXT:    jne .LBB70_1
; X64-SSE41-NEXT:  # %bb.2:
; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-SSE41-NEXT:    retq
; X64-SSE41-NEXT:  .LBB70_1:
; X64-SSE41-NEXT:    addss %xmm0, %xmm1
; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE41-NEXT:    retq
;
; X64-AVX1-LABEL: add_ss_mask:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    testb $1, %dil
; X64-AVX1-NEXT:    je .LBB70_2
; X64-AVX1-NEXT:  # %bb.1:
; X64-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT:  .LBB70_2:
; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: add_ss_mask:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    kmovw %edi, %k1
; X64-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; X64-AVX512-NEXT:    vmovaps %xmm2, %xmm0
; X64-AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %a, i64 0
  %2 = extractelement <4 x float> %b, i64 0
  %3 = fadd float %1, %2
  %4 = extractelement <4 x float> %c, i32 0
  %5 = bitcast i8 %mask to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, float %3, float %4
  %8 = insertelement <4 x float> %a, float %7, i64 0
  ret <4 x float> %8
}

define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; X86-SSE2-LABEL: add_sd_mask:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    jne .LBB71_1
; X86-SSE2-NEXT:  # %bb.2:
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X86-SSE2-NEXT:    retl
; X86-SSE2-NEXT:  .LBB71_1:
; X86-SSE2-NEXT:    addsd %xmm0, %xmm1
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT:    retl
;
; X86-SSE41-LABEL: add_sd_mask:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE41-NEXT:    jne .LBB71_1
; X86-SSE41-NEXT:  # %bb.2:
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; X86-SSE41-NEXT:    retl
; X86-SSE41-NEXT:  .LBB71_1:
; X86-SSE41-NEXT:    addsd %xmm0, %xmm1
; X86-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE41-NEXT:    retl
;
; X86-AVX1-LABEL: add_sd_mask:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    je .LBB71_2
; X86-AVX1-NEXT:  # %bb.1:
; X86-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT:  .LBB71_2:
; X86-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: add_sd_mask:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-AVX512-NEXT:    kmovw %eax, %k1
; X86-AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; X86-AVX512-NEXT:    vmovapd %xmm2, %xmm0
; X86-AVX512-NEXT:    retl
;
; X64-SSE2-LABEL: add_sd_mask:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    testb $1, %dil
; X64-SSE2-NEXT:    jne .LBB71_1
; X64-SSE2-NEXT:  # %bb.2:
; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-SSE2-NEXT:    retq
; X64-SSE2-NEXT:  .LBB71_1:
; X64-SSE2-NEXT:    addsd %xmm0, %xmm1
; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-SSE2-NEXT:    retq
;
; X64-SSE41-LABEL: add_sd_mask:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    testb $1, %dil
; X64-SSE41-NEXT:    jne .LBB71_1
; X64-SSE41-NEXT:  # %bb.2:
; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; X64-SSE41-NEXT:    retq
; X64-SSE41-NEXT:  .LBB71_1:
; X64-SSE41-NEXT:    addsd %xmm0, %xmm1
; X64-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-SSE41-NEXT:    retq
;
; X64-AVX1-LABEL: add_sd_mask:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    testb $1, %dil
; X64-AVX1-NEXT:    je .LBB71_2
; X64-AVX1-NEXT:  # %bb.1:
; X64-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT:  .LBB71_2:
; X64-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: add_sd_mask:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    kmovw %edi, %k1
; X64-AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; X64-AVX512-NEXT:    vmovapd %xmm2, %xmm0
; X64-AVX512-NEXT:    retq
  %1 = extractelement <2 x double> %a, i64 0
  %2 = extractelement <2 x double> %b, i64 0
  %3 = fadd double %1, %2
  %4 = extractelement <2 x double> %c, i32 0
  %5 = bitcast i8 %mask to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %3, double %4
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}
