; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX

; Extract/insert/shuffle chain producing lanes x[0],zero,x[2],a[2] should
; lower to a single (v)insertps.
define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuffle_v4f32_0z27:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0z27:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %x, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
  ret <4 x float> %vecinit5
}

; Insert chain producing xyzw[0],zero,zero,abcd[0] should lower to a single
; (v)insertps with two zeroed lanes.
define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0zz4:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0zz4:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %xyzw, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2
  %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit4
}

; Chain producing xyzw[0],zero,xyzw[2],abcd[0] should lower to a single
; (v)insertps.
define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0z24:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %xyzw, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %xyzw, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %abcd, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit5
}

; Building a,zero,zero,a from a scalar should lower to a single (v)insertps
; that duplicates lane 0 into lane 3.
define <4 x float> @shuffle_v4f32_0zz0(float %a) {
; SSE-LABEL: shuffle_v4f32_0zz0:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0zz0:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; AVX-NEXT:    retq
  %vecinit = insertelement <4 x float> undef, float %a, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecinit2 = insertelement <4 x float> %vecinit1, float 0.000000e+00, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit2, float %a, i32 3
  ret <4 x float> %vecinit3
}

; Chain producing A[0],zero,B[2],zero should lower to a single (v)insertps.
define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: shuffle_v4f32_0z6z:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_0z6z:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
  %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
  %vecext2 = extractelement <4 x float> %B, i32 2
  %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
  %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
  ret <4 x float> %vecinit4
}

; Two-shuffle chain producing zero,a[0],b[2],zero should fold the zero vector
; into a single (v)insertps.
define <4 x float> @shuffle_v4f32_z06z(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_z06z:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_z06z:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[0],xmm1[2],zero
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 undef>
  %shuffle1 = shufflevector <4 x float> %shuffle, <4 x float> <float 0.000000e+00, float undef, float undef, float 0.000000e+00>, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
  ret <4 x float> %shuffle1
}

; Two-shuffle chain producing a[0],b[1],zero,zero should fold into a single
; (v)insertps.
define <4 x float> @shuffle_v4f32_05zz(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_05zz:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_05zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuffle1 = shufflevector <4 x float> %shuffle, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x float> %shuffle1
}

; The insertps immediate (21) plus the trailing zeroing shuffle leave no live
; lanes from %res0, so the fadd should be eliminated entirely.
define <4 x float> @insertps_undef_input0(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: insertps_undef_input0:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: insertps_undef_input0:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
; AVX-NEXT:    retq
  %res0 = fadd <4 x float> %a0, <float 1.0, float 1.0, float 1.0, float 1.0>
  %res1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %res0, <4 x float> %a1, i8 21)
  %res2 = shufflevector <4 x float> %res1, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %res2
}

; The trailing shuffle zeroes the only lane insertps took from %res1's second
; source, so the fadd is dead and the result becomes a zero/blend.
define <4 x float> @insertps_undef_input1(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: insertps_undef_input1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: insertps_undef_input1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX-NEXT:    retq
  %res0 = fadd <4 x float> %a1, <float 1.0, float 1.0, float 1.0, float 1.0>
  %res1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %res0, i8 21)
  %res2 = shufflevector <4 x float> %res1, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %res2
}

; Lane 6 of the shuffle reads a known-zero float from the bitcast f64 constant
; vector, so the shuffle should lower to insertps with a zeroed lane 0.
define <4 x float> @insertps_zero_from_v2f64(<4 x float> %a0, <2 x double>* %a1) nounwind {
; SSE-LABEL: insertps_zero_from_v2f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd (%rdi), %xmm1
; SSE-NEXT:    addpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
; SSE-NEXT:    movapd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: insertps_zero_from_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovapd (%rdi), %xmm1
; AVX-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
; AVX-NEXT:    vmovapd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %1 = load <2 x double>, <2 x double>* %a1
  %2 = bitcast <2 x double> <double 1.0, double 2.0> to <4 x float>
  %3 = fadd <2 x double> %1, <double 1.0, double 2.0>
  %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 6, i32 2, i32 2, i32 3>
  store <2 x double> %3, <2 x double> *%a1
  ret <4 x float> %4
}

; Lane 5 of the shuffle reads a known-zero float from the bitcast i64 constant
; vector, so the shuffle should lower to insertps with a zeroed lane 0.
define <4 x float> @insertps_zero_from_v2i64(<4 x float> %a0, <2 x i64>* %a1) nounwind {
; SSE-LABEL: insertps_zero_from_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
; SSE-NEXT:    movdqa %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: insertps_zero_from_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm1
; AVX-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64>* %a1
  %2 = bitcast <2 x i64> <i64 1, i64 -2> to <4 x float>
  %3 = add <2 x i64> %1, <i64 1, i64 -2>
  %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 5, i32 2, i32 2, i32 3>
  store <2 x i64> %3, <2 x i64> *%a1
  ret <4 x float> %4
}

; Lane 4 of the shuffle reads a known-zero float from the bitcast i16 constant
; vector, so the shuffle should lower to insertps with a zeroed lane 0.
define <4 x float> @insertps_zero_from_v8i16(<4 x float> %a0, <8 x i16>* %a1) nounwind {
; SSE-LABEL: insertps_zero_from_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm1
; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
; SSE-NEXT:    movdqa %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: insertps_zero_from_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm1
; AVX-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16>* %a1
  %2 = bitcast <8 x i16> <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3> to <4 x float>
  %3 = add <8 x i16> %1, <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3>
  %4 = shufflevector <4 x float> %a0, <4 x float> %2, <4 x i32> <i32 4, i32 2, i32 2, i32 3>
  store <8 x i16> %3, <8 x i16> *%a1
  ret <4 x float> %4
}

; Two consecutive scalar loads combined by insertps (imm 28: lanes 0,1 from
; the loads, lanes 2,3 zeroed) should merge into a single 64-bit movsd load.
define <4 x float> @consecutive_load_insertps_04zz(float* %p) {
; SSE-LABEL: consecutive_load_insertps_04zz:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: consecutive_load_insertps_04zz:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    retq
  %p0 = getelementptr inbounds float, float* %p, i64 1
  %p1 = getelementptr inbounds float, float* %p, i64 2
  %s0 = load float, float* %p0
  %s1 = load float, float* %p1
  %v0 = insertelement <4 x float> undef, float %s0, i32 0
  %v1 = insertelement <4 x float> undef, float %s1, i32 0
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %v0, <4 x float> %v1, i8 28)
  ret <4 x float> %res
}

; insertps imm 21 zeroes lane 0, so extracting element 0 should constant-fold
; to 0.0 (a zeroed xmm register).
define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: extract_zero_insertps_z0z7:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: extract_zero_insertps_z0z7:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 21)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

; insertps imm 64 puts %a1's element 1 into lane 0; extracting lane 0 should
; become a direct load+shuffle of element 1 from memory.
define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
; SSE-LABEL: extract_lane_insertps_5123:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = mem[1,1,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: extract_lane_insertps_5123:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = mem[1,1,3,3]
; AVX-NEXT:    retq
  %a1 = load <4 x float>, <4 x float> *%p1
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 64)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

; insertps imm 128 puts %a1's element 2 into lane 0; extracting lane 0 should
; become a direct load+shuffle of element 2 from memory.
define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) {
; SSE-LABEL: extract_lane_insertps_6123:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: extract_lane_insertps_6123:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
; AVX-NEXT:    retq
  %a1 = load <4 x float>, <4 x float> *%p1
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
  %ext = extractelement <4 x float> %res, i32 0
  ret float %ext
}

; PR40340
; With the loaded vector as insertps' first source, the operands should be
; commuted so the load folds into the (v)insertps memory operand.
define <4 x float> @commute_load_insertps(<4 x float>, <4 x float>* nocapture readonly) {
; SSE-LABEL: commute_load_insertps:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],zero,mem[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: commute_load_insertps:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm0[1],zero,mem[0]
; AVX-NEXT:    retq
  %3 = load <4 x float>, <4 x float>* %1
  %4 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %3, <4 x float> %0, i8 85)
  ret <4 x float> %4
}

; SSE4.1 insertps intrinsic used by the tests above.
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone