; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2   | FileCheck %s --check-prefixes=ALL,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx    | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2   | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl  | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,X86AVX2
9
; Inserting at an undef index folds the insert away entirely, so every
; configuration lowers this to a bare return of an undefined vector.
define <16 x i8> @undef_index(i8 %x) nounwind {
; ALL-LABEL: undef_index:
; ALL:       # %bb.0:
; ALL-NEXT:    ret{{[l|q]}}
  %ins = insertelement <16 x i8> undef, i8 %x, i64 undef
  ret <16 x i8> %ins
}
17
; Inserting an undef scalar leaves the source vector unchanged (already in
; xmm0), so every configuration lowers this to a bare return.
define <16 x i8> @undef_scalar(<16 x i8> %x, i32 %index) nounwind {
; ALL-LABEL: undef_scalar:
; ALL:       # %bb.0:
; ALL-NEXT:    ret{{[l|q]}}
  %ins = insertelement <16 x i8> %x, i8 undef, i32 %index
  ret <16 x i8> %ins
}
25
;
; Insertion into undef vectors
;
29
; Variable-index insert of an i8 argument into an undef v16i8: because every
; other lane is undef, this may be lowered as a splat of the scalar — via
; shuffles on SSE, and a single byte broadcast on AVX2/AVX512.
define <16 x i8> @arg_i8_v16i8_undef(i8 %x, i32 %y) nounwind {
; SSE2-LABEL: arg_i8_v16i8_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd %edi, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: arg_i8_v16i8_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movd %edi, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: arg_i8_v16i8_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_i8_v16i8_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: arg_i8_v16i8_undef:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovd %edi, %xmm0
; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: arg_i8_v16i8_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %edi, %xmm0
; AVX512BW-NEXT:    retq
;
; X86AVX2-LABEL: arg_i8_v16i8_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %xmm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <16 x i8> undef, i8 %x, i32 %y
  ret <16 x i8> %ins
}
77
; Variable-index insert of an i16 argument into an undef v8i16, lowered as a
; word splat (shuffles on SSE/AVX1, vpbroadcastw on AVX2/AVX512).
define <8 x i16> @arg_i16_v8i16_undef(i16 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i16_v8i16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %edi, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_i16_v8i16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_i16_v8i16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: arg_i16_v8i16_undef:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovd %edi, %xmm0
; AVX512F-NEXT:    vpbroadcastw %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: arg_i16_v8i16_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastw %edi, %xmm0
; AVX512BW-NEXT:    retq
;
; X86AVX2-LABEL: arg_i16_v8i16_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %xmm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <8 x i16> undef, i16 %x, i32 %y
  ret <8 x i16> %ins
}
117
; Variable-index insert of an i32 argument into an undef v4i32, lowered as a
; dword splat; AVX512 can broadcast straight from the GPR.
define <4 x i32> @arg_i32_v4i32_undef(i32 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i32_v4i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %edi, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_i32_v4i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_i32_v4i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: arg_i32_v4i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %edi, %xmm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: arg_i32_v4i32_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <4 x i32> undef, i32 %x, i32 %y
  ret <4 x i32> %ins
}
149
; Variable-index insert of an i64 argument into an undef v2i64, lowered as a
; qword splat; on i686 the 64-bit value is splatted from its stack slot.
define <2 x i64> @arg_i64_v2i64_undef(i64 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i64_v2i64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_i64_v2i64_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_i64_v2i64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm0
; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: arg_i64_v2i64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %rdi, %xmm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: arg_i64_v2i64_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86AVX2-NEXT:    retl
  %ins = insertelement <2 x i64> undef, i64 %x, i32 %y
  ret <2 x i64> %ins
}
181
; Variable-index insert of a float argument (already in xmm0) into an undef
; v4f32, lowered as a lane-0 splat.
define <4 x float> @arg_f32_v4f32_undef(float %x, i32 %y) nounwind {
; SSE-LABEL: arg_f32_v4f32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_f32_v4f32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_f32_v4f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: arg_f32_v4f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: arg_f32_v4f32_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <4 x float> undef, float %x, i32 %y
  ret <4 x float> %ins
}
210
; Variable-index insert of a double argument into an undef v2f64, lowered as
; a low-element duplicate (movlhps on SSE2, movddup when available).
define <2 x double> @arg_f64_v2f64_undef(double %x, i32 %y) nounwind {
; SSE2-LABEL: arg_f64_v2f64_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: arg_f64_v2f64_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: arg_f64_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: arg_f64_v2f64_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86AVX2-NEXT:    retl
  %ins = insertelement <2 x double> undef, double %x, i32 %y
  ret <2 x double> %ins
}
234
; Same splat lowering as arg_i8_v16i8_undef but with the scalar loaded from
; memory; AVX2/AVX512 fold the load into a single vpbroadcastb from memory.
define <16 x i8> @load_i8_v16i8_undef(i8* %p, i32 %y) nounwind {
; SSE2-LABEL: load_i8_v16i8_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movzbl (%rdi), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: load_i8_v16i8_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movzbl (%rdi), %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: load_i8_v16i8_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_i8_v16i8_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: load_i8_v16i8_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb (%rdi), %xmm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: load_i8_v16i8_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vpbroadcastb (%eax), %xmm0
; X86AVX2-NEXT:    retl
  %x = load i8, i8* %p
  %ins = insertelement <16 x i8> undef, i8 %x, i32 %y
  ret <16 x i8> %ins
}
280
; Word splat of a scalar loaded from memory; AVX2/AVX512 fold the load into
; vpbroadcastw from memory.
define <8 x i16> @load_i16_v8i16_undef(i16* %p, i32 %y) nounwind {
; SSE-LABEL: load_i16_v8i16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_i16_v8i16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_i16_v8i16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: load_i16_v8i16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastw (%rdi), %xmm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: load_i16_v8i16_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vpbroadcastw (%eax), %xmm0
; X86AVX2-NEXT:    retl
  %x = load i16, i16* %p
  %ins = insertelement <8 x i16> undef, i16 %x, i32 %y
  ret <8 x i16> %ins
}
317
; Dword splat of a loaded scalar; all AVX levels use a memory-operand
; vbroadcastss.
define <4 x i32> @load_i32_v4i32_undef(i32* %p, i32 %y) nounwind {
; SSE-LABEL: load_i32_v4i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_v4i32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: load_i32_v4i32_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vbroadcastss (%eax), %xmm0
; X86AVX2-NEXT:    retl
  %x = load i32, i32* %p
  %ins = insertelement <4 x i32> undef, i32 %x, i32 %y
  ret <4 x i32> %ins
}
339
; Qword splat of a loaded scalar; AVX uses vmovddup directly from memory.
define <2 x i64> @load_i64_v2i64_undef(i64* %p, i32 %y) nounwind {
; SSE-LABEL: load_i64_v2i64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i64_v2i64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: load_i64_v2i64_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86AVX2-NEXT:    retl
  %x = load i64, i64* %p
  %ins = insertelement <2 x i64> undef, i64 %x, i32 %y
  ret <2 x i64> %ins
}
361
; Float splat of a loaded scalar; AVX uses a memory-operand vbroadcastss.
define <4 x float> @load_f32_v4f32_undef(float* %p, i32 %y) nounwind {
; SSE-LABEL: load_f32_v4f32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_f32_v4f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: load_f32_v4f32_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vbroadcastss (%eax), %xmm0
; X86AVX2-NEXT:    retl
  %x = load float, float* %p
  %ins = insertelement <4 x float> undef, float %x, i32 %y
  ret <4 x float> %ins
}
383
; Double splat of a loaded scalar; SSE4.1 and AVX fold the load into
; movddup/vmovddup from memory.
define <2 x double> @load_f64_v2f64_undef(double* %p, i32 %y) nounwind {
; SSE2-LABEL: load_f64_v2f64_undef:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: load_f64_v2f64_undef:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_f64_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: load_f64_v2f64_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86AVX2-NEXT:    retl
  %x = load double, double* %p
  %ins = insertelement <2 x double> undef, double %x, i32 %y
  ret <2 x double> %ins
}
410
; 256-bit variant: SSE has no 256-bit registers, so it goes through a stack
; slot (store byte at masked index, reload two xmm halves); AVX targets splat
; into a ymm instead.
define <32 x i8> @arg_i8_v32i8_undef(i8 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i8_v32i8_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    andl $31, %esi
; SSE-NEXT:    movb %dil, -40(%rsp,%rsi)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_i8_v32i8_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_i8_v32i8_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: arg_i8_v32i8_undef:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovd %edi, %xmm0
; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: arg_i8_v32i8_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %edi, %ymm0
; AVX512BW-NEXT:    retq
;
; X86AVX2-LABEL: arg_i8_v32i8_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vpbroadcastb {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <32 x i8> undef, i8 %x, i32 %y
  ret <32 x i8> %ins
}
453
; 256-bit word variant: stack round-trip on SSE, ymm word splat on AVX.
define <16 x i16> @arg_i16_v16i16_undef(i16 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i16_v16i16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    andl $15, %esi
; SSE-NEXT:    movw %di, -40(%rsp,%rsi,2)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_i16_v16i16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_i16_v16i16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm0
; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: arg_i16_v16i16_undef:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovd %edi, %xmm0
; AVX512F-NEXT:    vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: arg_i16_v16i16_undef:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastw %edi, %ymm0
; AVX512BW-NEXT:    retq
;
; X86AVX2-LABEL: arg_i16_v16i16_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vpbroadcastw {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <16 x i16> undef, i16 %x, i32 %y
  ret <16 x i16> %ins
}
496
; 256-bit dword variant: stack round-trip on SSE, ymm dword splat on AVX.
define <8 x i32> @arg_i32_v8i32_undef(i32 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i32_v8i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    andl $7, %esi
; SSE-NEXT:    movl %edi, -40(%rsp,%rsi,4)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_i32_v8i32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_i32_v8i32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: arg_i32_v8i32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %edi, %ymm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: arg_i32_v8i32_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <8 x i32> undef, i32 %x, i32 %y
  ret <8 x i32> %ins
}
532
; 256-bit qword variant: stack round-trip on SSE, ymm qword splat on AVX.
define <4 x i64> @arg_i64_v4i64_undef(i64 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i64_v4i64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    andl $3, %esi
; SSE-NEXT:    movq %rdi, -40(%rsp,%rsi,8)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_i64_v4i64_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_i64_v4i64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm0
; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: arg_i64_v4i64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastq %rdi, %ymm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: arg_i64_v4i64_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <4 x i64> undef, i64 %x, i32 %y
  ret <4 x i64> %ins
}
568
; 256-bit float variant: the scalar arrives in xmm0; SSE spills it to a
; stack slot at the masked index, AVX splats it across a ymm.
define <8 x float> @arg_f32_v8f32_undef(float %x, i32 %y) nounwind {
; SSE-LABEL: arg_f32_v8f32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    andl $7, %edi
; SSE-NEXT:    movss %xmm0, -40(%rsp,%rdi,4)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_f32_v8f32_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_f32_v8f32_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: arg_f32_v8f32_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastss %xmm0, %ymm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: arg_f32_v8f32_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <8 x float> undef, float %x, i32 %y
  ret <8 x float> %ins
}
602
; 256-bit double variant: stack round-trip on SSE, ymm double splat on AVX.
define <4 x double> @arg_f64_v4f64_undef(double %x, i32 %y) nounwind {
; SSE-LABEL: arg_f64_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    andl $3, %edi
; SSE-NEXT:    movsd %xmm0, -40(%rsp,%rdi,8)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_f64_v4f64_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_f64_v4f64_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: arg_f64_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: arg_f64_v4f64_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT:    retl
  %ins = insertelement <4 x double> undef, double %x, i32 %y
  ret <4 x double> %ins
}
636
; 256-bit byte splat of a loaded scalar: SSE stores the byte into a stack
; slot and reloads two xmms; AVX2/AVX512 use one vpbroadcastb from memory.
define <32 x i8> @load_i8_v32i8_undef(i8* %p, i32 %y) nounwind {
; SSE-LABEL: load_i8_v32i8_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movb (%rdi), %al
; SSE-NEXT:    andl $31, %esi
; SSE-NEXT:    movb %al, -40(%rsp,%rsi)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_i8_v32i8_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzbl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_i8_v32i8_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: load_i8_v32i8_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb (%rdi), %ymm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: load_i8_v32i8_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vpbroadcastb (%eax), %ymm0
; X86AVX2-NEXT:    retl
  %x = load i8, i8* %p
  %ins = insertelement <32 x i8> undef, i8 %x, i32 %y
  ret <32 x i8> %ins
}
676
; 256-bit word splat of a loaded scalar: stack round-trip on SSE, single
; memory-operand vpbroadcastw on AVX2/AVX512.
define <16 x i16> @load_i16_v16i16_undef(i16* %p, i32 %y) nounwind {
; SSE-LABEL: load_i16_v16i16_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    andl $15, %esi
; SSE-NEXT:    movw %ax, -40(%rsp,%rsi,2)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_i16_v16i16_undef:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movzwl (%rdi), %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_i16_v16i16_undef:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: load_i16_v16i16_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastw (%rdi), %ymm0
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: load_i16_v16i16_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vpbroadcastw (%eax), %ymm0
; X86AVX2-NEXT:    retl
  %x = load i16, i16* %p
  %ins = insertelement <16 x i16> undef, i16 %x, i32 %y
  ret <16 x i16> %ins
}
716
; 256-bit dword splat of a loaded scalar: stack round-trip on SSE, single
; memory-operand vbroadcastss on all AVX levels.
define <8 x i32> @load_i32_v8i32_undef(i32* %p, i32 %y) nounwind {
; SSE-LABEL: load_i32_v8i32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movl (%rdi), %eax
; SSE-NEXT:    andl $7, %esi
; SSE-NEXT:    movl %eax, -40(%rsp,%rsi,4)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i32_v8i32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: load_i32_v8i32_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vbroadcastss (%eax), %ymm0
; X86AVX2-NEXT:    retl
  %x = load i32, i32* %p
  %ins = insertelement <8 x i32> undef, i32 %x, i32 %y
  ret <8 x i32> %ins
}
742
; 256-bit qword splat of a loaded scalar: stack round-trip on SSE, single
; memory-operand vbroadcastsd on all AVX levels.
define <4 x i64> @load_i64_v4i64_undef(i64* %p, i32 %y) nounwind {
; SSE-LABEL: load_i64_v4i64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movq (%rdi), %rax
; SSE-NEXT:    andl $3, %esi
; SSE-NEXT:    movq %rax, -40(%rsp,%rsi,8)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: load_i64_v4i64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: load_i64_v4i64_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vbroadcastsd (%eax), %ymm0
; X86AVX2-NEXT:    retl
  %x = load i64, i64* %p
  %ins = insertelement <4 x i64> undef, i64 %x, i32 %y
  ret <4 x i64> %ins
}
768
; 256-bit float splat of a loaded scalar: stack round-trip on SSE, single
; memory-operand vbroadcastss on all AVX levels.
define <8 x float> @load_f32_v8f32_undef(float* %p, i32 %y) nounwind {
; SSE-LABEL: load_f32_v8f32_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    andl $7, %esi
; SSE-NEXT:    movss %xmm0, -40(%rsp,%rsi,4)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: load_f32_v8f32_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: load_f32_v8f32_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vbroadcastss (%eax), %ymm0
; X86AVX2-NEXT:    retl
  %x = load float, float* %p
  %ins = insertelement <8 x float> undef, float %x, i32 %y
  ret <8 x float> %ins
}
794
; 256-bit double splat of a loaded scalar: stack round-trip on SSE, single
; memory-operand vbroadcastsd on all AVX levels.
define <4 x double> @load_f64_v4f64_undef(double* %p, i32 %y) nounwind {
; SSE-LABEL: load_f64_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    andl $3, %esi
; SSE-NEXT:    movsd %xmm0, -40(%rsp,%rsi,8)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: load_f64_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: load_f64_v4f64_undef:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vbroadcastsd (%eax), %ymm0
; X86AVX2-NEXT:    retl
  %x = load double, double* %p
  %ins = insertelement <4 x double> undef, double %x, i32 %y
  ret <4 x double> %ins
}
820
;
; Insertion into arg vectors
;
824
; Variable-index insert into a LIVE vector: the generic lowering spills the
; vector to an aligned stack slot, stores the scalar at the masked index, and
; reloads; AVX512BW instead builds a lane mask with vpcmpeqb and does a
; merge-masked register broadcast, with no stack traffic.
define <16 x i8> @arg_i8_v16i8(<16 x i8> %v, i8 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i8_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $15, %esi
; SSE-NEXT:    movb %dil, -24(%rsp,%rsi)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: arg_i8_v16i8:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX1OR2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT:    andl $15, %esi
; AVX1OR2-NEXT:    movb %dil, -24(%rsp,%rsi)
; AVX1OR2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: arg_i8_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX512F-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT:    andl $15, %esi
; AVX512F-NEXT:    movb %dil, -24(%rsp,%rsi)
; AVX512F-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: arg_i8_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %xmm1
; AVX512BW-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; AVX512BW-NEXT:    vpbroadcastb %edi, %xmm0 {%k1}
; AVX512BW-NEXT:    retq
;
; X86AVX2-LABEL: arg_i8_v16i8:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    pushl %ebp
; X86AVX2-NEXT:    movl %esp, %ebp
; X86AVX2-NEXT:    andl $-16, %esp
; X86AVX2-NEXT:    subl $32, %esp
; X86AVX2-NEXT:    movl 12(%ebp), %eax
; X86AVX2-NEXT:    andl $15, %eax
; X86AVX2-NEXT:    movb 8(%ebp), %cl
; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
; X86AVX2-NEXT:    movb %cl, (%esp,%eax)
; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
; X86AVX2-NEXT:    movl %ebp, %esp
; X86AVX2-NEXT:    popl %ebp
; X86AVX2-NEXT:    retl
  %ins = insertelement <16 x i8> %v, i8 %x, i32 %y
  ret <16 x i8> %ins
}
878
; Word variant of the live-vector variable insert: stack spill/store/reload
; on most configs, merge-masked vpbroadcastw on AVX512BW.
define <8 x i16> @arg_i16_v8i16(<8 x i16> %v, i16 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i16_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $7, %esi
; SSE-NEXT:    movw %di, -24(%rsp,%rsi,2)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: arg_i16_v8i16:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX1OR2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT:    andl $7, %esi
; AVX1OR2-NEXT:    movw %di, -24(%rsp,%rsi,2)
; AVX1OR2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: arg_i16_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX512F-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT:    andl $7, %esi
; AVX512F-NEXT:    movw %di, -24(%rsp,%rsi,2)
; AVX512F-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: arg_i16_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastw %esi, %xmm1
; AVX512BW-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; AVX512BW-NEXT:    vpbroadcastw %edi, %xmm0 {%k1}
; AVX512BW-NEXT:    retq
;
; X86AVX2-LABEL: arg_i16_v8i16:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    pushl %ebp
; X86AVX2-NEXT:    movl %esp, %ebp
; X86AVX2-NEXT:    andl $-16, %esp
; X86AVX2-NEXT:    subl $32, %esp
; X86AVX2-NEXT:    movl 12(%ebp), %eax
; X86AVX2-NEXT:    andl $7, %eax
; X86AVX2-NEXT:    movzwl 8(%ebp), %ecx
; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
; X86AVX2-NEXT:    movw %cx, (%esp,%eax,2)
; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
; X86AVX2-NEXT:    movl %ebp, %esp
; X86AVX2-NEXT:    popl %ebp
; X86AVX2-NEXT:    retl
  %ins = insertelement <8 x i16> %v, i16 %x, i32 %y
  ret <8 x i16> %ins
}
932
; Dword variant of the live-vector variable insert: stack spill/store/reload
; on SSE and AVX1/AVX2; both AVX512 configs (AVX512F suffices for dwords)
; use the merge-masked vpbroadcastd form.
define <4 x i32> @arg_i32_v4i32(<4 x i32> %v, i32 %x, i32 %y) nounwind {
; SSE-LABEL: arg_i32_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $3, %esi
; SSE-NEXT:    movl %edi, -24(%rsp,%rsi,4)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: arg_i32_v4i32:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX1OR2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1OR2-NEXT:    andl $3, %esi
; AVX1OR2-NEXT:    movl %edi, -24(%rsp,%rsi,4)
; AVX1OR2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
; AVX1OR2-NEXT:    retq
;
; AVX512-LABEL: arg_i32_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %esi, %xmm1
; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; AVX512-NEXT:    vpbroadcastd %edi, %xmm0 {%k1}
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: arg_i32_v4i32:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    pushl %ebp
; X86AVX2-NEXT:    movl %esp, %ebp
; X86AVX2-NEXT:    andl $-16, %esp
; X86AVX2-NEXT:    subl $32, %esp
; X86AVX2-NEXT:    movl 12(%ebp), %eax
; X86AVX2-NEXT:    andl $3, %eax
; X86AVX2-NEXT:    movl 8(%ebp), %ecx
; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
; X86AVX2-NEXT:    movl %ecx, (%esp,%eax,4)
; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
; X86AVX2-NEXT:    movl %ebp, %esp
; X86AVX2-NEXT:    popl %ebp
; X86AVX2-NEXT:    retl
  %ins = insertelement <4 x i32> %v, i32 %x, i32 %y
  ret <4 x i32> %ins
}
977
; Insert i64 %x into <2 x i64> %v at variable index %y.
; 64-bit targets use the stack spill + indexed movq store + reload pattern
; (index clamped with 'andl $1'); AVX512 sign-extends %y (movslq) and uses a
; compare-generated mask with a masked vpbroadcastq.  On i686 the i64 scalar
; is split into two 32-bit halves, stored at dword indices 2y and 2y+1 with
; a spill/reload round-trip for each half.
978define <2 x i64> @arg_i64_v2i64(<2 x i64> %v, i64 %x, i32 %y) nounwind {
979; SSE-LABEL: arg_i64_v2i64:
980; SSE:       # %bb.0:
981; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
982; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
983; SSE-NEXT:    andl $1, %esi
984; SSE-NEXT:    movq %rdi, -24(%rsp,%rsi,8)
985; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
986; SSE-NEXT:    retq
987;
988; AVX1OR2-LABEL: arg_i64_v2i64:
989; AVX1OR2:       # %bb.0:
990; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
991; AVX1OR2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
992; AVX1OR2-NEXT:    andl $1, %esi
993; AVX1OR2-NEXT:    movq %rdi, -24(%rsp,%rsi,8)
994; AVX1OR2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
995; AVX1OR2-NEXT:    retq
996;
997; AVX512-LABEL: arg_i64_v2i64:
998; AVX512:       # %bb.0:
999; AVX512-NEXT:    movslq %esi, %rax
1000; AVX512-NEXT:    vpbroadcastq %rax, %xmm1
1001; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
1002; AVX512-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1}
1003; AVX512-NEXT:    retq
1004;
1005; X86AVX2-LABEL: arg_i64_v2i64:
1006; X86AVX2:       # %bb.0:
1007; X86AVX2-NEXT:    pushl %ebp
1008; X86AVX2-NEXT:    movl %esp, %ebp
1009; X86AVX2-NEXT:    pushl %esi
1010; X86AVX2-NEXT:    andl $-16, %esp
1011; X86AVX2-NEXT:    subl $48, %esp
1012; X86AVX2-NEXT:    movl 8(%ebp), %eax
1013; X86AVX2-NEXT:    movl 12(%ebp), %ecx
1014; X86AVX2-NEXT:    movl 16(%ebp), %edx
1015; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
1016; X86AVX2-NEXT:    leal (%edx,%edx), %esi
1017; X86AVX2-NEXT:    andl $3, %esi
1018; X86AVX2-NEXT:    movl %eax, (%esp,%esi,4)
1019; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
1020; X86AVX2-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
1021; X86AVX2-NEXT:    leal 1(%edx,%edx), %eax
1022; X86AVX2-NEXT:    andl $3, %eax
1023; X86AVX2-NEXT:    movl %ecx, 16(%esp,%eax,4)
1024; X86AVX2-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
1025; X86AVX2-NEXT:    leal -4(%ebp), %esp
1026; X86AVX2-NEXT:    popl %esi
1027; X86AVX2-NEXT:    popl %ebp
1028; X86AVX2-NEXT:    retl
1029  %ins = insertelement <2 x i64> %v, i64 %x, i32 %y
1030  ret <2 x i64> %ins
1031}
1032
; Insert float %x into <4 x float> %v at variable index %y.
; SSE2 goes through the stack (movss to the clamped slot).  SSE4.1/AVX1/AVX2
; stay in registers: splat the scalar, build a lane mask by comparing a
; broadcast of the index against a constant-pool index vector, then select
; with blendvps/vblendvps.  AVX512 folds the select into a masked
; vbroadcastss under %k1.
1033define <4 x float> @arg_f32_v4f32(<4 x float> %v, float %x, i32 %y) nounwind {
1034; SSE2-LABEL: arg_f32_v4f32:
1035; SSE2:       # %bb.0:
1036; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
1037; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1038; SSE2-NEXT:    andl $3, %edi
1039; SSE2-NEXT:    movss %xmm1, -24(%rsp,%rdi,4)
1040; SSE2-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1041; SSE2-NEXT:    retq
1042;
1043; SSE41-LABEL: arg_f32_v4f32:
1044; SSE41:       # %bb.0:
1045; SSE41-NEXT:    movaps %xmm0, %xmm2
1046; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
1047; SSE41-NEXT:    movd %edi, %xmm0
1048; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1049; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1050; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm2
1051; SSE41-NEXT:    movaps %xmm2, %xmm0
1052; SSE41-NEXT:    retq
1053;
1054; AVX1-LABEL: arg_f32_v4f32:
1055; AVX1:       # %bb.0:
1056; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
1057; AVX1-NEXT:    vmovd %edi, %xmm2
1058; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1059; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1060; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1061; AVX1-NEXT:    retq
1062;
1063; AVX2-LABEL: arg_f32_v4f32:
1064; AVX2:       # %bb.0:
1065; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
1066; AVX2-NEXT:    vmovd %edi, %xmm2
1067; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
1068; AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1069; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1070; AVX2-NEXT:    retq
1071;
1072; AVX512-LABEL: arg_f32_v4f32:
1073; AVX512:       # %bb.0:
1074; AVX512-NEXT:    vpbroadcastd %edi, %xmm2
1075; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
1076; AVX512-NEXT:    vbroadcastss %xmm1, %xmm0 {%k1}
1077; AVX512-NEXT:    retq
1078;
1079; X86AVX2-LABEL: arg_f32_v4f32:
1080; X86AVX2:       # %bb.0:
1081; X86AVX2-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1082; X86AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1083; X86AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm2
1084; X86AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1085; X86AVX2-NEXT:    retl
1086  %ins = insertelement <4 x float> %v, float %x, i32 %y
1087  ret <4 x float> %ins
1088}
1089
; Insert double %x into <2 x double> %v at variable index %y.
; SSE2 uses the stack (movsd to the clamped slot).  SSE4.1/AVX1/AVX2 splat
; the scalar with movddup, sign-extend the index (movslq) to compare as
; qwords against a constant-pool index vector, and select via blendvpd.
; AVX512 performs a masked movddup under %k1.  The i686 path goes through
; a 16-byte-aligned stack slot.
1090define <2 x double> @arg_f64_v2f64(<2 x double> %v, double %x, i32 %y) nounwind {
1091; SSE2-LABEL: arg_f64_v2f64:
1092; SSE2:       # %bb.0:
1093; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
1094; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1095; SSE2-NEXT:    andl $1, %edi
1096; SSE2-NEXT:    movsd %xmm1, -24(%rsp,%rdi,8)
1097; SSE2-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1098; SSE2-NEXT:    retq
1099;
1100; SSE41-LABEL: arg_f64_v2f64:
1101; SSE41:       # %bb.0:
1102; SSE41-NEXT:    movapd %xmm0, %xmm2
1103; SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
1104; SSE41-NEXT:    movslq %edi, %rax
1105; SSE41-NEXT:    movq %rax, %xmm0
1106; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1107; SSE41-NEXT:    pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1108; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
1109; SSE41-NEXT:    movapd %xmm2, %xmm0
1110; SSE41-NEXT:    retq
1111;
1112; AVX1-LABEL: arg_f64_v2f64:
1113; AVX1:       # %bb.0:
1114; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
1115; AVX1-NEXT:    movslq %edi, %rax
1116; AVX1-NEXT:    vmovq %rax, %xmm2
1117; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1118; AVX1-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1119; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1120; AVX1-NEXT:    retq
1121;
1122; AVX2-LABEL: arg_f64_v2f64:
1123; AVX2:       # %bb.0:
1124; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
1125; AVX2-NEXT:    movslq %edi, %rax
1126; AVX2-NEXT:    vmovq %rax, %xmm2
1127; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
1128; AVX2-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1129; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1130; AVX2-NEXT:    retq
1131;
1132; AVX512-LABEL: arg_f64_v2f64:
1133; AVX512:       # %bb.0:
1134; AVX512-NEXT:    movslq %edi, %rax
1135; AVX512-NEXT:    vpbroadcastq %rax, %xmm2
1136; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %k1
1137; AVX512-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
1138; AVX512-NEXT:    retq
1139;
1140; X86AVX2-LABEL: arg_f64_v2f64:
1141; X86AVX2:       # %bb.0:
1142; X86AVX2-NEXT:    pushl %ebp
1143; X86AVX2-NEXT:    movl %esp, %ebp
1144; X86AVX2-NEXT:    andl $-16, %esp
1145; X86AVX2-NEXT:    subl $32, %esp
1146; X86AVX2-NEXT:    movl 16(%ebp), %eax
1147; X86AVX2-NEXT:    andl $1, %eax
1148; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
1149; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
1150; X86AVX2-NEXT:    vmovsd %xmm1, (%esp,%eax,8)
1151; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
1152; X86AVX2-NEXT:    movl %ebp, %esp
1153; X86AVX2-NEXT:    popl %ebp
1154; X86AVX2-NEXT:    retl
1155  %ins = insertelement <2 x double> %v, double %x, i32 %y
1156  ret <2 x double> %ins
1157}
1158
; Insert an i8 loaded from %p into <16 x i8> %v at variable index %y.
; Byte-element inserts lack a direct vector instruction below AVX512BW, so
; SSE/AVX1OR2/AVX512F (no BW) all spill to the stack, store the loaded byte
; at the clamped index ('andl $15'), and reload.  AVX512BW does a masked
; vpbroadcastb straight from memory under a compare-generated %k1.
1159define <16 x i8> @load_i8_v16i8(<16 x i8> %v, i8* %p, i32 %y) nounwind {
1160; SSE-LABEL: load_i8_v16i8:
1161; SSE:       # %bb.0:
1162; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
1163; SSE-NEXT:    movb (%rdi), %al
1164; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1165; SSE-NEXT:    andl $15, %esi
1166; SSE-NEXT:    movb %al, -24(%rsp,%rsi)
1167; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1168; SSE-NEXT:    retq
1169;
1170; AVX1OR2-LABEL: load_i8_v16i8:
1171; AVX1OR2:       # %bb.0:
1172; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
1173; AVX1OR2-NEXT:    movb (%rdi), %al
1174; AVX1OR2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1175; AVX1OR2-NEXT:    andl $15, %esi
1176; AVX1OR2-NEXT:    movb %al, -24(%rsp,%rsi)
1177; AVX1OR2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
1178; AVX1OR2-NEXT:    retq
1179;
1180; AVX512F-LABEL: load_i8_v16i8:
1181; AVX512F:       # %bb.0:
1182; AVX512F-NEXT:    # kill: def $esi killed $esi def $rsi
1183; AVX512F-NEXT:    movb (%rdi), %al
1184; AVX512F-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1185; AVX512F-NEXT:    andl $15, %esi
1186; AVX512F-NEXT:    movb %al, -24(%rsp,%rsi)
1187; AVX512F-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
1188; AVX512F-NEXT:    retq
1189;
1190; AVX512BW-LABEL: load_i8_v16i8:
1191; AVX512BW:       # %bb.0:
1192; AVX512BW-NEXT:    vpbroadcastb %esi, %xmm1
1193; AVX512BW-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
1194; AVX512BW-NEXT:    vpbroadcastb (%rdi), %xmm0 {%k1}
1195; AVX512BW-NEXT:    retq
1196;
1197; X86AVX2-LABEL: load_i8_v16i8:
1198; X86AVX2:       # %bb.0:
1199; X86AVX2-NEXT:    pushl %ebp
1200; X86AVX2-NEXT:    movl %esp, %ebp
1201; X86AVX2-NEXT:    andl $-16, %esp
1202; X86AVX2-NEXT:    subl $32, %esp
1203; X86AVX2-NEXT:    movl 12(%ebp), %eax
1204; X86AVX2-NEXT:    andl $15, %eax
1205; X86AVX2-NEXT:    movl 8(%ebp), %ecx
1206; X86AVX2-NEXT:    movb (%ecx), %cl
1207; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
1208; X86AVX2-NEXT:    movb %cl, (%esp,%eax)
1209; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
1210; X86AVX2-NEXT:    movl %ebp, %esp
1211; X86AVX2-NEXT:    popl %ebp
1212; X86AVX2-NEXT:    retl
1213  %x = load i8, i8* %p
1214  %ins = insertelement <16 x i8> %v, i8 %x, i32 %y
1215  ret <16 x i8> %ins
1216}
1217
; Insert an i16 loaded from %p into <8 x i16> %v at variable index %y.
; SSE/AVX1OR2/AVX512F (no BW) use the stack spill + indexed movw store +
; reload pattern, with the index clamped by 'andl $7'.  AVX512BW instead
; builds %k1 from a word compare and does a masked vpbroadcastw directly
; from the memory operand.
1218define <8 x i16> @load_i16_v8i16(<8 x i16> %v, i16* %p, i32 %y) nounwind {
1219; SSE-LABEL: load_i16_v8i16:
1220; SSE:       # %bb.0:
1221; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
1222; SSE-NEXT:    movzwl (%rdi), %eax
1223; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1224; SSE-NEXT:    andl $7, %esi
1225; SSE-NEXT:    movw %ax, -24(%rsp,%rsi,2)
1226; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1227; SSE-NEXT:    retq
1228;
1229; AVX1OR2-LABEL: load_i16_v8i16:
1230; AVX1OR2:       # %bb.0:
1231; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
1232; AVX1OR2-NEXT:    movzwl (%rdi), %eax
1233; AVX1OR2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1234; AVX1OR2-NEXT:    andl $7, %esi
1235; AVX1OR2-NEXT:    movw %ax, -24(%rsp,%rsi,2)
1236; AVX1OR2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
1237; AVX1OR2-NEXT:    retq
1238;
1239; AVX512F-LABEL: load_i16_v8i16:
1240; AVX512F:       # %bb.0:
1241; AVX512F-NEXT:    # kill: def $esi killed $esi def $rsi
1242; AVX512F-NEXT:    movzwl (%rdi), %eax
1243; AVX512F-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1244; AVX512F-NEXT:    andl $7, %esi
1245; AVX512F-NEXT:    movw %ax, -24(%rsp,%rsi,2)
1246; AVX512F-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
1247; AVX512F-NEXT:    retq
1248;
1249; AVX512BW-LABEL: load_i16_v8i16:
1250; AVX512BW:       # %bb.0:
1251; AVX512BW-NEXT:    vpbroadcastw %esi, %xmm1
1252; AVX512BW-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
1253; AVX512BW-NEXT:    vpbroadcastw (%rdi), %xmm0 {%k1}
1254; AVX512BW-NEXT:    retq
1255;
1256; X86AVX2-LABEL: load_i16_v8i16:
1257; X86AVX2:       # %bb.0:
1258; X86AVX2-NEXT:    pushl %ebp
1259; X86AVX2-NEXT:    movl %esp, %ebp
1260; X86AVX2-NEXT:    andl $-16, %esp
1261; X86AVX2-NEXT:    subl $32, %esp
1262; X86AVX2-NEXT:    movl 12(%ebp), %eax
1263; X86AVX2-NEXT:    andl $7, %eax
1264; X86AVX2-NEXT:    movl 8(%ebp), %ecx
1265; X86AVX2-NEXT:    movzwl (%ecx), %ecx
1266; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
1267; X86AVX2-NEXT:    movw %cx, (%esp,%eax,2)
1268; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
1269; X86AVX2-NEXT:    movl %ebp, %esp
1270; X86AVX2-NEXT:    popl %ebp
1271; X86AVX2-NEXT:    retl
1272  %x = load i16, i16* %p
1273  %ins = insertelement <8 x i16> %v, i16 %x, i32 %y
1274  ret <8 x i16> %ins
1275}
1276
; Insert an i32 loaded from %p into <4 x i32> %v at variable index %y.
; SSE/AVX1OR2/X86AVX2 spill the vector, store the loaded dword at the
; clamped index ('andl $3'), and reload.  AVX512 builds %k1 from a dword
; compare and does a masked vpbroadcastd with a memory source — the load
; is folded into the broadcast.
1277define <4 x i32> @load_i32_v4i32(<4 x i32> %v, i32* %p, i32 %y) nounwind {
1278; SSE-LABEL: load_i32_v4i32:
1279; SSE:       # %bb.0:
1280; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
1281; SSE-NEXT:    movl (%rdi), %eax
1282; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1283; SSE-NEXT:    andl $3, %esi
1284; SSE-NEXT:    movl %eax, -24(%rsp,%rsi,4)
1285; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1286; SSE-NEXT:    retq
1287;
1288; AVX1OR2-LABEL: load_i32_v4i32:
1289; AVX1OR2:       # %bb.0:
1290; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
1291; AVX1OR2-NEXT:    movl (%rdi), %eax
1292; AVX1OR2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1293; AVX1OR2-NEXT:    andl $3, %esi
1294; AVX1OR2-NEXT:    movl %eax, -24(%rsp,%rsi,4)
1295; AVX1OR2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
1296; AVX1OR2-NEXT:    retq
1297;
1298; AVX512-LABEL: load_i32_v4i32:
1299; AVX512:       # %bb.0:
1300; AVX512-NEXT:    vpbroadcastd %esi, %xmm1
1301; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
1302; AVX512-NEXT:    vpbroadcastd (%rdi), %xmm0 {%k1}
1303; AVX512-NEXT:    retq
1304;
1305; X86AVX2-LABEL: load_i32_v4i32:
1306; X86AVX2:       # %bb.0:
1307; X86AVX2-NEXT:    pushl %ebp
1308; X86AVX2-NEXT:    movl %esp, %ebp
1309; X86AVX2-NEXT:    andl $-16, %esp
1310; X86AVX2-NEXT:    subl $32, %esp
1311; X86AVX2-NEXT:    movl 12(%ebp), %eax
1312; X86AVX2-NEXT:    andl $3, %eax
1313; X86AVX2-NEXT:    movl 8(%ebp), %ecx
1314; X86AVX2-NEXT:    movl (%ecx), %ecx
1315; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
1316; X86AVX2-NEXT:    movl %ecx, (%esp,%eax,4)
1317; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
1318; X86AVX2-NEXT:    movl %ebp, %esp
1319; X86AVX2-NEXT:    popl %ebp
1320; X86AVX2-NEXT:    retl
1321  %x = load i32, i32* %p
1322  %ins = insertelement <4 x i32> %v, i32 %x, i32 %y
1323  ret <4 x i32> %ins
1324}
1325
; Insert an i64 loaded from %p into <2 x i64> %v at variable index %y.
; 64-bit targets use the stack spill + indexed movq store + reload pattern
; ('andl $1' clamp); AVX512 folds the load into a masked vpbroadcastq.
; On i686 the qword is handled as two dword halves stored at indices 2y
; and 2y+1, with a spill/reload round-trip per half.
1326define <2 x i64> @load_i64_v2i64(<2 x i64> %v, i64* %p, i32 %y) nounwind {
1327; SSE-LABEL: load_i64_v2i64:
1328; SSE:       # %bb.0:
1329; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
1330; SSE-NEXT:    movq (%rdi), %rax
1331; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1332; SSE-NEXT:    andl $1, %esi
1333; SSE-NEXT:    movq %rax, -24(%rsp,%rsi,8)
1334; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1335; SSE-NEXT:    retq
1336;
1337; AVX1OR2-LABEL: load_i64_v2i64:
1338; AVX1OR2:       # %bb.0:
1339; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
1340; AVX1OR2-NEXT:    movq (%rdi), %rax
1341; AVX1OR2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1342; AVX1OR2-NEXT:    andl $1, %esi
1343; AVX1OR2-NEXT:    movq %rax, -24(%rsp,%rsi,8)
1344; AVX1OR2-NEXT:    vmovaps -{{[0-9]+}}(%rsp), %xmm0
1345; AVX1OR2-NEXT:    retq
1346;
1347; AVX512-LABEL: load_i64_v2i64:
1348; AVX512:       # %bb.0:
1349; AVX512-NEXT:    movslq %esi, %rax
1350; AVX512-NEXT:    vpbroadcastq %rax, %xmm1
1351; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
1352; AVX512-NEXT:    vpbroadcastq (%rdi), %xmm0 {%k1}
1353; AVX512-NEXT:    retq
1354;
1355; X86AVX2-LABEL: load_i64_v2i64:
1356; X86AVX2:       # %bb.0:
1357; X86AVX2-NEXT:    pushl %ebp
1358; X86AVX2-NEXT:    movl %esp, %ebp
1359; X86AVX2-NEXT:    pushl %esi
1360; X86AVX2-NEXT:    andl $-16, %esp
1361; X86AVX2-NEXT:    subl $48, %esp
1362; X86AVX2-NEXT:    movl 12(%ebp), %eax
1363; X86AVX2-NEXT:    movl 8(%ebp), %ecx
1364; X86AVX2-NEXT:    movl (%ecx), %edx
1365; X86AVX2-NEXT:    movl 4(%ecx), %ecx
1366; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
1367; X86AVX2-NEXT:    leal (%eax,%eax), %esi
1368; X86AVX2-NEXT:    andl $3, %esi
1369; X86AVX2-NEXT:    movl %edx, (%esp,%esi,4)
1370; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
1371; X86AVX2-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
1372; X86AVX2-NEXT:    leal 1(%eax,%eax), %eax
1373; X86AVX2-NEXT:    andl $3, %eax
1374; X86AVX2-NEXT:    movl %ecx, 16(%esp,%eax,4)
1375; X86AVX2-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
1376; X86AVX2-NEXT:    leal -4(%ebp), %esp
1377; X86AVX2-NEXT:    popl %esi
1378; X86AVX2-NEXT:    popl %ebp
1379; X86AVX2-NEXT:    retl
1380  %x = load i64, i64* %p
1381  %ins = insertelement <2 x i64> %v, i64 %x, i32 %y
1382  ret <2 x i64> %ins
1383}
1384
; Insert a float loaded from %p into <4 x float> %v at variable index %y.
; SSE2 goes through the stack; SSE4.1 loads and splats the scalar, compares
; a broadcast of the index against a constant-pool index vector, and uses
; blendvps.  AVX1/AVX2 use vbroadcastss from memory + vblendvps; AVX512
; folds everything into a masked vbroadcastss from memory under %k1.
1385define <4 x float> @load_f32_v4f32(<4 x float> %v, float* %p, i32 %y) nounwind {
1386; SSE2-LABEL: load_f32_v4f32:
1387; SSE2:       # %bb.0:
1388; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
1389; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1390; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1391; SSE2-NEXT:    andl $3, %esi
1392; SSE2-NEXT:    movss %xmm1, -24(%rsp,%rsi,4)
1393; SSE2-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1394; SSE2-NEXT:    retq
1395;
1396; SSE41-LABEL: load_f32_v4f32:
1397; SSE41:       # %bb.0:
1398; SSE41-NEXT:    movaps %xmm0, %xmm1
1399; SSE41-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1400; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
1401; SSE41-NEXT:    movd %esi, %xmm0
1402; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1403; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1404; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm1
1405; SSE41-NEXT:    movaps %xmm1, %xmm0
1406; SSE41-NEXT:    retq
1407;
1408; AVX1-LABEL: load_f32_v4f32:
1409; AVX1:       # %bb.0:
1410; AVX1-NEXT:    vbroadcastss (%rdi), %xmm1
1411; AVX1-NEXT:    vmovd %esi, %xmm2
1412; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1413; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1414; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1415; AVX1-NEXT:    retq
1416;
1417; AVX2-LABEL: load_f32_v4f32:
1418; AVX2:       # %bb.0:
1419; AVX2-NEXT:    vbroadcastss (%rdi), %xmm1
1420; AVX2-NEXT:    vmovd %esi, %xmm2
1421; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
1422; AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1423; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
1424; AVX2-NEXT:    retq
1425;
1426; AVX512-LABEL: load_f32_v4f32:
1427; AVX512:       # %bb.0:
1428; AVX512-NEXT:    vpbroadcastd %esi, %xmm1
1429; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
1430; AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 {%k1}
1431; AVX512-NEXT:    retq
1432;
1433; X86AVX2-LABEL: load_f32_v4f32:
1434; X86AVX2:       # %bb.0:
1435; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1436; X86AVX2-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm1
1437; X86AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1438; X86AVX2-NEXT:    vbroadcastss (%eax), %xmm2
1439; X86AVX2-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1440; X86AVX2-NEXT:    retl
1441  %x = load float, float* %p
1442  %ins = insertelement <4 x float> %v, float %x, i32 %y
1443  ret <4 x float> %ins
1444}
1445
; Insert a double loaded from %p into <2 x double> %v at variable index %y.
; SSE2 goes through the stack.  SSE4.1/AVX1/AVX2 splat the loaded scalar
; with movddup from memory, sign-extend the index (movslq) to compare as
; qwords, and select with blendvpd.  AVX512 does a masked vmovddup from
; memory under %k1.  The i686 path uses a 16-byte-aligned stack slot.
1446define <2 x double> @load_f64_v2f64(<2 x double> %v, double* %p, i32 %y) nounwind {
1447; SSE2-LABEL: load_f64_v2f64:
1448; SSE2:       # %bb.0:
1449; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
1450; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
1451; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1452; SSE2-NEXT:    andl $1, %esi
1453; SSE2-NEXT:    movsd %xmm1, -24(%rsp,%rsi,8)
1454; SSE2-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1455; SSE2-NEXT:    retq
1456;
1457; SSE41-LABEL: load_f64_v2f64:
1458; SSE41:       # %bb.0:
1459; SSE41-NEXT:    movapd %xmm0, %xmm1
1460; SSE41-NEXT:    movddup {{.*#+}} xmm2 = mem[0,0]
1461; SSE41-NEXT:    movslq %esi, %rax
1462; SSE41-NEXT:    movq %rax, %xmm0
1463; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1464; SSE41-NEXT:    pcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1465; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
1466; SSE41-NEXT:    movapd %xmm1, %xmm0
1467; SSE41-NEXT:    retq
1468;
1469; AVX1-LABEL: load_f64_v2f64:
1470; AVX1:       # %bb.0:
1471; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
1472; AVX1-NEXT:    movslq %esi, %rax
1473; AVX1-NEXT:    vmovq %rax, %xmm2
1474; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
1475; AVX1-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1476; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1477; AVX1-NEXT:    retq
1478;
1479; AVX2-LABEL: load_f64_v2f64:
1480; AVX2:       # %bb.0:
1481; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
1482; AVX2-NEXT:    movslq %esi, %rax
1483; AVX2-NEXT:    vmovq %rax, %xmm2
1484; AVX2-NEXT:    vpbroadcastq %xmm2, %xmm2
1485; AVX2-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1486; AVX2-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1487; AVX2-NEXT:    retq
1488;
1489; AVX512-LABEL: load_f64_v2f64:
1490; AVX512:       # %bb.0:
1491; AVX512-NEXT:    movslq %esi, %rax
1492; AVX512-NEXT:    vpbroadcastq %rax, %xmm1
1493; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
1494; AVX512-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
1495; AVX512-NEXT:    retq
1496;
1497; X86AVX2-LABEL: load_f64_v2f64:
1498; X86AVX2:       # %bb.0:
1499; X86AVX2-NEXT:    pushl %ebp
1500; X86AVX2-NEXT:    movl %esp, %ebp
1501; X86AVX2-NEXT:    andl $-16, %esp
1502; X86AVX2-NEXT:    subl $32, %esp
1503; X86AVX2-NEXT:    movl 12(%ebp), %eax
1504; X86AVX2-NEXT:    andl $1, %eax
1505; X86AVX2-NEXT:    movl 8(%ebp), %ecx
1506; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
1507; X86AVX2-NEXT:    vmovaps %xmm0, (%esp)
1508; X86AVX2-NEXT:    vmovsd %xmm1, (%esp,%eax,8)
1509; X86AVX2-NEXT:    vmovaps (%esp), %xmm0
1510; X86AVX2-NEXT:    movl %ebp, %esp
1511; X86AVX2-NEXT:    popl %ebp
1512; X86AVX2-NEXT:    retl
1513  %x = load double, double* %p
1514  %ins = insertelement <2 x double> %v, double %x, i32 %y
1515  ret <2 x double> %ins
1516}
1517
; Insert i8 %x into a 256-bit <32 x i8> %v at variable index %y.
; SSE splits the vector across two xmm spill slots and stores the byte at
; the clamped index ('andl $31').  AVX1OR2/AVX512F need a 32-byte-aligned
; frame (rbp + 'andq $-32') for the ymm spill + byte store + reload; only
; AVX512BW can do the in-register masked vpbroadcastb on ymm.
1518define <32 x i8> @arg_i8_v32i8(<32 x i8> %v, i8 %x, i32 %y) nounwind {
1519; SSE-LABEL: arg_i8_v32i8:
1520; SSE:       # %bb.0:
1521; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
1522; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1523; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1524; SSE-NEXT:    andl $31, %esi
1525; SSE-NEXT:    movb %dil, -40(%rsp,%rsi)
1526; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1527; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
1528; SSE-NEXT:    retq
1529;
1530; AVX1OR2-LABEL: arg_i8_v32i8:
1531; AVX1OR2:       # %bb.0:
1532; AVX1OR2-NEXT:    pushq %rbp
1533; AVX1OR2-NEXT:    movq %rsp, %rbp
1534; AVX1OR2-NEXT:    andq $-32, %rsp
1535; AVX1OR2-NEXT:    subq $64, %rsp
1536; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
1537; AVX1OR2-NEXT:    vmovaps %ymm0, (%rsp)
1538; AVX1OR2-NEXT:    andl $31, %esi
1539; AVX1OR2-NEXT:    movb %dil, (%rsp,%rsi)
1540; AVX1OR2-NEXT:    vmovaps (%rsp), %ymm0
1541; AVX1OR2-NEXT:    movq %rbp, %rsp
1542; AVX1OR2-NEXT:    popq %rbp
1543; AVX1OR2-NEXT:    retq
1544;
1545; AVX512F-LABEL: arg_i8_v32i8:
1546; AVX512F:       # %bb.0:
1547; AVX512F-NEXT:    pushq %rbp
1548; AVX512F-NEXT:    movq %rsp, %rbp
1549; AVX512F-NEXT:    andq $-32, %rsp
1550; AVX512F-NEXT:    subq $64, %rsp
1551; AVX512F-NEXT:    # kill: def $esi killed $esi def $rsi
1552; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)
1553; AVX512F-NEXT:    andl $31, %esi
1554; AVX512F-NEXT:    movb %dil, (%rsp,%rsi)
1555; AVX512F-NEXT:    vmovaps (%rsp), %ymm0
1556; AVX512F-NEXT:    movq %rbp, %rsp
1557; AVX512F-NEXT:    popq %rbp
1558; AVX512F-NEXT:    retq
1559;
1560; AVX512BW-LABEL: arg_i8_v32i8:
1561; AVX512BW:       # %bb.0:
1562; AVX512BW-NEXT:    vpbroadcastb %esi, %ymm1
1563; AVX512BW-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
1564; AVX512BW-NEXT:    vpbroadcastb %edi, %ymm0 {%k1}
1565; AVX512BW-NEXT:    retq
1566;
1567; X86AVX2-LABEL: arg_i8_v32i8:
1568; X86AVX2:       # %bb.0:
1569; X86AVX2-NEXT:    pushl %ebp
1570; X86AVX2-NEXT:    movl %esp, %ebp
1571; X86AVX2-NEXT:    andl $-32, %esp
1572; X86AVX2-NEXT:    subl $64, %esp
1573; X86AVX2-NEXT:    movl 12(%ebp), %eax
1574; X86AVX2-NEXT:    andl $31, %eax
1575; X86AVX2-NEXT:    movb 8(%ebp), %cl
1576; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
1577; X86AVX2-NEXT:    movb %cl, (%esp,%eax)
1578; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
1579; X86AVX2-NEXT:    movl %ebp, %esp
1580; X86AVX2-NEXT:    popl %ebp
1581; X86AVX2-NEXT:    retl
1582  %ins = insertelement <32 x i8> %v, i8 %x, i32 %y
1583  ret <32 x i8> %ins
1584}
1585
; Insert i16 %x into a 256-bit <16 x i16> %v at variable index %y.
; SSE spills both xmm halves and stores the word at the clamped index
; ('andl $15').  AVX1OR2/AVX512F use a 32-byte-aligned ymm spill slot
; (rbp frame + 'andq $-32'); AVX512BW keeps it in registers with a word
; compare into %k1 and a masked vpbroadcastw.
1586define <16 x i16> @arg_i16_v16i16(<16 x i16> %v, i16 %x, i32 %y) nounwind {
1587; SSE-LABEL: arg_i16_v16i16:
1588; SSE:       # %bb.0:
1589; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
1590; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1591; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1592; SSE-NEXT:    andl $15, %esi
1593; SSE-NEXT:    movw %di, -40(%rsp,%rsi,2)
1594; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1595; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
1596; SSE-NEXT:    retq
1597;
1598; AVX1OR2-LABEL: arg_i16_v16i16:
1599; AVX1OR2:       # %bb.0:
1600; AVX1OR2-NEXT:    pushq %rbp
1601; AVX1OR2-NEXT:    movq %rsp, %rbp
1602; AVX1OR2-NEXT:    andq $-32, %rsp
1603; AVX1OR2-NEXT:    subq $64, %rsp
1604; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
1605; AVX1OR2-NEXT:    vmovaps %ymm0, (%rsp)
1606; AVX1OR2-NEXT:    andl $15, %esi
1607; AVX1OR2-NEXT:    movw %di, (%rsp,%rsi,2)
1608; AVX1OR2-NEXT:    vmovaps (%rsp), %ymm0
1609; AVX1OR2-NEXT:    movq %rbp, %rsp
1610; AVX1OR2-NEXT:    popq %rbp
1611; AVX1OR2-NEXT:    retq
1612;
1613; AVX512F-LABEL: arg_i16_v16i16:
1614; AVX512F:       # %bb.0:
1615; AVX512F-NEXT:    pushq %rbp
1616; AVX512F-NEXT:    movq %rsp, %rbp
1617; AVX512F-NEXT:    andq $-32, %rsp
1618; AVX512F-NEXT:    subq $64, %rsp
1619; AVX512F-NEXT:    # kill: def $esi killed $esi def $rsi
1620; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)
1621; AVX512F-NEXT:    andl $15, %esi
1622; AVX512F-NEXT:    movw %di, (%rsp,%rsi,2)
1623; AVX512F-NEXT:    vmovaps (%rsp), %ymm0
1624; AVX512F-NEXT:    movq %rbp, %rsp
1625; AVX512F-NEXT:    popq %rbp
1626; AVX512F-NEXT:    retq
1627;
1628; AVX512BW-LABEL: arg_i16_v16i16:
1629; AVX512BW:       # %bb.0:
1630; AVX512BW-NEXT:    vpbroadcastw %esi, %ymm1
1631; AVX512BW-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
1632; AVX512BW-NEXT:    vpbroadcastw %edi, %ymm0 {%k1}
1633; AVX512BW-NEXT:    retq
1634;
1635; X86AVX2-LABEL: arg_i16_v16i16:
1636; X86AVX2:       # %bb.0:
1637; X86AVX2-NEXT:    pushl %ebp
1638; X86AVX2-NEXT:    movl %esp, %ebp
1639; X86AVX2-NEXT:    andl $-32, %esp
1640; X86AVX2-NEXT:    subl $64, %esp
1641; X86AVX2-NEXT:    movl 12(%ebp), %eax
1642; X86AVX2-NEXT:    andl $15, %eax
1643; X86AVX2-NEXT:    movzwl 8(%ebp), %ecx
1644; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
1645; X86AVX2-NEXT:    movw %cx, (%esp,%eax,2)
1646; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
1647; X86AVX2-NEXT:    movl %ebp, %esp
1648; X86AVX2-NEXT:    popl %ebp
1649; X86AVX2-NEXT:    retl
1650  %ins = insertelement <16 x i16> %v, i16 %x, i32 %y
1651  ret <16 x i16> %ins
1652}
1653
; Insert i32 %x into a 256-bit <8 x i32> %v at variable index %y.
; SSE spills both xmm halves and stores the dword at the clamped index
; ('andl $7').  AVX1OR2/X86AVX2 use a 32-byte-aligned ymm spill slot;
; all AVX512 variants (dword element, no BW needed) use the in-register
; masked vpbroadcastd under a compare-generated %k1.
1654define <8 x i32> @arg_i32_v8i32(<8 x i32> %v, i32 %x, i32 %y) nounwind {
1655; SSE-LABEL: arg_i32_v8i32:
1656; SSE:       # %bb.0:
1657; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
1658; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1659; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1660; SSE-NEXT:    andl $7, %esi
1661; SSE-NEXT:    movl %edi, -40(%rsp,%rsi,4)
1662; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1663; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
1664; SSE-NEXT:    retq
1665;
1666; AVX1OR2-LABEL: arg_i32_v8i32:
1667; AVX1OR2:       # %bb.0:
1668; AVX1OR2-NEXT:    pushq %rbp
1669; AVX1OR2-NEXT:    movq %rsp, %rbp
1670; AVX1OR2-NEXT:    andq $-32, %rsp
1671; AVX1OR2-NEXT:    subq $64, %rsp
1672; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
1673; AVX1OR2-NEXT:    vmovaps %ymm0, (%rsp)
1674; AVX1OR2-NEXT:    andl $7, %esi
1675; AVX1OR2-NEXT:    movl %edi, (%rsp,%rsi,4)
1676; AVX1OR2-NEXT:    vmovaps (%rsp), %ymm0
1677; AVX1OR2-NEXT:    movq %rbp, %rsp
1678; AVX1OR2-NEXT:    popq %rbp
1679; AVX1OR2-NEXT:    retq
1680;
1681; AVX512-LABEL: arg_i32_v8i32:
1682; AVX512:       # %bb.0:
1683; AVX512-NEXT:    vpbroadcastd %esi, %ymm1
1684; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
1685; AVX512-NEXT:    vpbroadcastd %edi, %ymm0 {%k1}
1686; AVX512-NEXT:    retq
1687;
1688; X86AVX2-LABEL: arg_i32_v8i32:
1689; X86AVX2:       # %bb.0:
1690; X86AVX2-NEXT:    pushl %ebp
1691; X86AVX2-NEXT:    movl %esp, %ebp
1692; X86AVX2-NEXT:    andl $-32, %esp
1693; X86AVX2-NEXT:    subl $64, %esp
1694; X86AVX2-NEXT:    movl 12(%ebp), %eax
1695; X86AVX2-NEXT:    andl $7, %eax
1696; X86AVX2-NEXT:    movl 8(%ebp), %ecx
1697; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
1698; X86AVX2-NEXT:    movl %ecx, (%esp,%eax,4)
1699; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
1700; X86AVX2-NEXT:    movl %ebp, %esp
1701; X86AVX2-NEXT:    popl %ebp
1702; X86AVX2-NEXT:    retl
1703  %ins = insertelement <8 x i32> %v, i32 %x, i32 %y
1704  ret <8 x i32> %ins
1705}
1706
; Insert i64 %x into a 256-bit <4 x i64> %v at variable index %y.
; SSE spills both xmm halves and stores the qword at the clamped index
; ('andl $3').  AVX1OR2 uses a 32-byte-aligned ymm spill slot.  AVX512
; sign-extends %y and performs a masked vpbroadcastq.  On i686 the i64 is
; split into two dword stores at indices 2y and 2y+1 ('andl $7' clamp),
; each with its own ymm spill/reload round-trip.
1707define <4 x i64> @arg_i64_v4i64(<4 x i64> %v, i64 %x, i32 %y) nounwind {
1708; SSE-LABEL: arg_i64_v4i64:
1709; SSE:       # %bb.0:
1710; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
1711; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1712; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1713; SSE-NEXT:    andl $3, %esi
1714; SSE-NEXT:    movq %rdi, -40(%rsp,%rsi,8)
1715; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1716; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
1717; SSE-NEXT:    retq
1718;
1719; AVX1OR2-LABEL: arg_i64_v4i64:
1720; AVX1OR2:       # %bb.0:
1721; AVX1OR2-NEXT:    pushq %rbp
1722; AVX1OR2-NEXT:    movq %rsp, %rbp
1723; AVX1OR2-NEXT:    andq $-32, %rsp
1724; AVX1OR2-NEXT:    subq $64, %rsp
1725; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
1726; AVX1OR2-NEXT:    vmovaps %ymm0, (%rsp)
1727; AVX1OR2-NEXT:    andl $3, %esi
1728; AVX1OR2-NEXT:    movq %rdi, (%rsp,%rsi,8)
1729; AVX1OR2-NEXT:    vmovaps (%rsp), %ymm0
1730; AVX1OR2-NEXT:    movq %rbp, %rsp
1731; AVX1OR2-NEXT:    popq %rbp
1732; AVX1OR2-NEXT:    retq
1733;
1734; AVX512-LABEL: arg_i64_v4i64:
1735; AVX512:       # %bb.0:
1736; AVX512-NEXT:    movslq %esi, %rax
1737; AVX512-NEXT:    vpbroadcastq %rax, %ymm1
1738; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
1739; AVX512-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1}
1740; AVX512-NEXT:    retq
1741;
1742; X86AVX2-LABEL: arg_i64_v4i64:
1743; X86AVX2:       # %bb.0:
1744; X86AVX2-NEXT:    pushl %ebp
1745; X86AVX2-NEXT:    movl %esp, %ebp
1746; X86AVX2-NEXT:    pushl %esi
1747; X86AVX2-NEXT:    andl $-32, %esp
1748; X86AVX2-NEXT:    subl $96, %esp
1749; X86AVX2-NEXT:    movl 8(%ebp), %eax
1750; X86AVX2-NEXT:    movl 12(%ebp), %ecx
1751; X86AVX2-NEXT:    movl 16(%ebp), %edx
1752; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
1753; X86AVX2-NEXT:    leal (%edx,%edx), %esi
1754; X86AVX2-NEXT:    andl $7, %esi
1755; X86AVX2-NEXT:    movl %eax, (%esp,%esi,4)
1756; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
1757; X86AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
1758; X86AVX2-NEXT:    leal 1(%edx,%edx), %eax
1759; X86AVX2-NEXT:    andl $7, %eax
1760; X86AVX2-NEXT:    movl %ecx, 32(%esp,%eax,4)
1761; X86AVX2-NEXT:    vmovaps {{[0-9]+}}(%esp), %ymm0
1762; X86AVX2-NEXT:    leal -4(%ebp), %esp
1763; X86AVX2-NEXT:    popl %esi
1764; X86AVX2-NEXT:    popl %ebp
1765; X86AVX2-NEXT:    retl
1766  %ins = insertelement <4 x i64> %v, i64 %x, i32 %y
1767  ret <4 x i64> %ins
1768}
1769
1770define <8 x float> @arg_f32_v8f32(<8 x float> %v, float %x, i32 %y) nounwind {
1771; SSE-LABEL: arg_f32_v8f32:
1772; SSE:       # %bb.0:
1773; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
1774; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
1775; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
1776; SSE-NEXT:    andl $7, %edi
1777; SSE-NEXT:    movss %xmm2, -40(%rsp,%rdi,4)
1778; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
1779; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
1780; SSE-NEXT:    retq
1781;
1782; AVX1-LABEL: arg_f32_v8f32:
1783; AVX1:       # %bb.0:
1784; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0]
1785; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
1786; AVX1-NEXT:    vmovd %edi, %xmm2
1787; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1788; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
1789; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
1790; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1791; AVX1-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
1792; AVX1-NEXT:    retq
1793;
1794; AVX2-LABEL: arg_f32_v8f32:
1795; AVX2:       # %bb.0:
1796; AVX2-NEXT:    vbroadcastss %xmm1, %ymm1
1797; AVX2-NEXT:    vmovd %edi, %xmm2
1798; AVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
1799; AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
1800; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
1801; AVX2-NEXT:    retq
1802;
1803; AVX512-LABEL: arg_f32_v8f32:
1804; AVX512:       # %bb.0:
1805; AVX512-NEXT:    vpbroadcastd %edi, %ymm2
1806; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %k1
1807; AVX512-NEXT:    vbroadcastss %xmm1, %ymm0 {%k1}
1808; AVX512-NEXT:    retq
1809;
1810; X86AVX2-LABEL: arg_f32_v8f32:
1811; X86AVX2:       # %bb.0:
1812; X86AVX2-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm1
1813; X86AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
1814; X86AVX2-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %ymm2
1815; X86AVX2-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
1816; X86AVX2-NEXT:    retl
1817  %ins = insertelement <8 x float> %v, float %x, i32 %y
1818  ret <8 x float> %ins
1819}
1820
; Insert a double scalar argument into a <4 x double> vector argument at a
; variable index.  SSE uses the stack round-trip; AVX1/AVX2 sign-extend the
; index, build a 64-bit lane compare mask, and vblendvpd a broadcast of the
; scalar; AVX512 uses a masked vbroadcastsd.
define <4 x double> @arg_f64_v4f64(<4 x double> %v, double %x, i32 %y) nounwind {
; SSE-LABEL: arg_f64_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $3, %edi
; SSE-NEXT:    movsd %xmm2, -40(%rsp,%rdi,8)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: arg_f64_v4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    movslq %edi, %rax
; AVX1-NEXT:    vmovq %rax, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: arg_f64_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
; AVX2-NEXT:    movslq %edi, %rax
; AVX2-NEXT:    vmovq %rax, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX2-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: arg_f64_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movslq %edi, %rax
; AVX512-NEXT:    vpbroadcastq %rax, %ymm2
; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %k1
; AVX512-NEXT:    vbroadcastsd %xmm1, %ymm0 {%k1}
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: arg_f64_v4f64:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    pushl %ebp
; X86AVX2-NEXT:    movl %esp, %ebp
; X86AVX2-NEXT:    andl $-32, %esp
; X86AVX2-NEXT:    subl $64, %esp
; X86AVX2-NEXT:    movl 16(%ebp), %eax
; X86AVX2-NEXT:    andl $3, %eax
; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
; X86AVX2-NEXT:    vmovsd %xmm1, (%esp,%eax,8)
; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
; X86AVX2-NEXT:    movl %ebp, %esp
; X86AVX2-NEXT:    popl %ebp
; X86AVX2-NEXT:    retl
  %ins = insertelement <4 x double> %v, double %x, i32 %y
  ret <4 x double> %ins
}
1882
; Insert a loaded i8 into a <32 x i8> vector at a variable index.  All targets
; except AVX512BW go through an aligned stack slot (ymm spill, byte store,
; reload); AVX512BW can use a masked vpbroadcastb straight from memory, which
; is why AVX512F and AVX512BW diverge here.
define <32 x i8> @load_i8_v32i8(<32 x i8> %v, i8* %p, i32 %y) nounwind {
; SSE-LABEL: load_i8_v32i8:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movb (%rdi), %al
; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $31, %esi
; SSE-NEXT:    movb %al, -40(%rsp,%rsi)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: load_i8_v32i8:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    pushq %rbp
; AVX1OR2-NEXT:    movq %rsp, %rbp
; AVX1OR2-NEXT:    andq $-32, %rsp
; AVX1OR2-NEXT:    subq $64, %rsp
; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX1OR2-NEXT:    movb (%rdi), %al
; AVX1OR2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1OR2-NEXT:    andl $31, %esi
; AVX1OR2-NEXT:    movb %al, (%rsp,%rsi)
; AVX1OR2-NEXT:    vmovaps (%rsp), %ymm0
; AVX1OR2-NEXT:    movq %rbp, %rsp
; AVX1OR2-NEXT:    popq %rbp
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: load_i8_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    movq %rsp, %rbp
; AVX512F-NEXT:    andq $-32, %rsp
; AVX512F-NEXT:    subq $64, %rsp
; AVX512F-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX512F-NEXT:    movb (%rdi), %al
; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)
; AVX512F-NEXT:    andl $31, %esi
; AVX512F-NEXT:    movb %al, (%rsp,%rsi)
; AVX512F-NEXT:    vmovaps (%rsp), %ymm0
; AVX512F-NEXT:    movq %rbp, %rsp
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: load_i8_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %ymm1
; AVX512BW-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
; AVX512BW-NEXT:    vpbroadcastb (%rdi), %ymm0 {%k1}
; AVX512BW-NEXT:    retq
;
; X86AVX2-LABEL: load_i8_v32i8:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    pushl %ebp
; X86AVX2-NEXT:    movl %esp, %ebp
; X86AVX2-NEXT:    andl $-32, %esp
; X86AVX2-NEXT:    subl $64, %esp
; X86AVX2-NEXT:    movl 12(%ebp), %eax
; X86AVX2-NEXT:    andl $31, %eax
; X86AVX2-NEXT:    movl 8(%ebp), %ecx
; X86AVX2-NEXT:    movb (%ecx), %cl
; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
; X86AVX2-NEXT:    movb %cl, (%esp,%eax)
; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
; X86AVX2-NEXT:    movl %ebp, %esp
; X86AVX2-NEXT:    popl %ebp
; X86AVX2-NEXT:    retl
  %x = load i8, i8* %p
  %ins = insertelement <32 x i8> %v, i8 %x, i32 %y
  ret <32 x i8> %ins
}
1955
; Insert a loaded i16 into a <16 x i16> vector at a variable index.  As with
; the byte case, only AVX512BW has the masked element broadcast (vpbroadcastw);
; SSE, AVX1/2 and AVX512F all use the stack round-trip.
define <16 x i16> @load_i16_v16i16(<16 x i16> %v, i16* %p, i32 %y) nounwind {
; SSE-LABEL: load_i16_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzwl (%rdi), %eax
; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $15, %esi
; SSE-NEXT:    movw %ax, -40(%rsp,%rsi,2)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: load_i16_v16i16:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    pushq %rbp
; AVX1OR2-NEXT:    movq %rsp, %rbp
; AVX1OR2-NEXT:    andq $-32, %rsp
; AVX1OR2-NEXT:    subq $64, %rsp
; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX1OR2-NEXT:    movzwl (%rdi), %eax
; AVX1OR2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1OR2-NEXT:    andl $15, %esi
; AVX1OR2-NEXT:    movw %ax, (%rsp,%rsi,2)
; AVX1OR2-NEXT:    vmovaps (%rsp), %ymm0
; AVX1OR2-NEXT:    movq %rbp, %rsp
; AVX1OR2-NEXT:    popq %rbp
; AVX1OR2-NEXT:    retq
;
; AVX512F-LABEL: load_i16_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    movq %rsp, %rbp
; AVX512F-NEXT:    andq $-32, %rsp
; AVX512F-NEXT:    subq $64, %rsp
; AVX512F-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX512F-NEXT:    movzwl (%rdi), %eax
; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)
; AVX512F-NEXT:    andl $15, %esi
; AVX512F-NEXT:    movw %ax, (%rsp,%rsi,2)
; AVX512F-NEXT:    vmovaps (%rsp), %ymm0
; AVX512F-NEXT:    movq %rbp, %rsp
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: load_i16_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastw %esi, %ymm1
; AVX512BW-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
; AVX512BW-NEXT:    vpbroadcastw (%rdi), %ymm0 {%k1}
; AVX512BW-NEXT:    retq
;
; X86AVX2-LABEL: load_i16_v16i16:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    pushl %ebp
; X86AVX2-NEXT:    movl %esp, %ebp
; X86AVX2-NEXT:    andl $-32, %esp
; X86AVX2-NEXT:    subl $64, %esp
; X86AVX2-NEXT:    movl 12(%ebp), %eax
; X86AVX2-NEXT:    andl $15, %eax
; X86AVX2-NEXT:    movl 8(%ebp), %ecx
; X86AVX2-NEXT:    movzwl (%ecx), %ecx
; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
; X86AVX2-NEXT:    movw %cx, (%esp,%eax,2)
; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
; X86AVX2-NEXT:    movl %ebp, %esp
; X86AVX2-NEXT:    popl %ebp
; X86AVX2-NEXT:    retl
  %x = load i16, i16* %p
  %ins = insertelement <16 x i16> %v, i16 %x, i32 %y
  ret <16 x i16> %ins
}
2028
; Insert a loaded i32 into a <8 x i32> vector at a variable index.  Dword
; elements are covered by base AVX512F/VL, so both AVX512 prefixes share the
; masked vpbroadcastd path; SSE and AVX1/2 use the stack round-trip.
define <8 x i32> @load_i32_v8i32(<8 x i32> %v, i32* %p, i32 %y) nounwind {
; SSE-LABEL: load_i32_v8i32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movl (%rdi), %eax
; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $7, %esi
; SSE-NEXT:    movl %eax, -40(%rsp,%rsi,4)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: load_i32_v8i32:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    pushq %rbp
; AVX1OR2-NEXT:    movq %rsp, %rbp
; AVX1OR2-NEXT:    andq $-32, %rsp
; AVX1OR2-NEXT:    subq $64, %rsp
; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX1OR2-NEXT:    movl (%rdi), %eax
; AVX1OR2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1OR2-NEXT:    andl $7, %esi
; AVX1OR2-NEXT:    movl %eax, (%rsp,%rsi,4)
; AVX1OR2-NEXT:    vmovaps (%rsp), %ymm0
; AVX1OR2-NEXT:    movq %rbp, %rsp
; AVX1OR2-NEXT:    popq %rbp
; AVX1OR2-NEXT:    retq
;
; AVX512-LABEL: load_i32_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %esi, %ymm1
; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
; AVX512-NEXT:    vpbroadcastd (%rdi), %ymm0 {%k1}
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: load_i32_v8i32:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    pushl %ebp
; X86AVX2-NEXT:    movl %esp, %ebp
; X86AVX2-NEXT:    andl $-32, %esp
; X86AVX2-NEXT:    subl $64, %esp
; X86AVX2-NEXT:    movl 12(%ebp), %eax
; X86AVX2-NEXT:    andl $7, %eax
; X86AVX2-NEXT:    movl 8(%ebp), %ecx
; X86AVX2-NEXT:    movl (%ecx), %ecx
; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
; X86AVX2-NEXT:    movl %ecx, (%esp,%eax,4)
; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
; X86AVX2-NEXT:    movl %ebp, %esp
; X86AVX2-NEXT:    popl %ebp
; X86AVX2-NEXT:    retl
  %x = load i32, i32* %p
  %ins = insertelement <8 x i32> %v, i32 %x, i32 %y
  ret <8 x i32> %ins
}
2085
; Insert a loaded i64 into a <4 x i64> vector at a variable index.  On 64-bit
; targets this is a single qword store into the spilled vector (or a masked
; vpbroadcastq on AVX512).  The 32-bit X86AVX2 lowering has no 64-bit GPR, so
; it performs two 32-bit inserts with a second ymm spill in between.
define <4 x i64> @load_i64_v4i64(<4 x i64> %v, i64* %p, i32 %y) nounwind {
; SSE-LABEL: load_i64_v4i64:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movq (%rdi), %rax
; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $3, %esi
; SSE-NEXT:    movq %rax, -40(%rsp,%rsi,8)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: load_i64_v4i64:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    pushq %rbp
; AVX1OR2-NEXT:    movq %rsp, %rbp
; AVX1OR2-NEXT:    andq $-32, %rsp
; AVX1OR2-NEXT:    subq $64, %rsp
; AVX1OR2-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX1OR2-NEXT:    movq (%rdi), %rax
; AVX1OR2-NEXT:    vmovaps %ymm0, (%rsp)
; AVX1OR2-NEXT:    andl $3, %esi
; AVX1OR2-NEXT:    movq %rax, (%rsp,%rsi,8)
; AVX1OR2-NEXT:    vmovaps (%rsp), %ymm0
; AVX1OR2-NEXT:    movq %rbp, %rsp
; AVX1OR2-NEXT:    popq %rbp
; AVX1OR2-NEXT:    retq
;
; AVX512-LABEL: load_i64_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movslq %esi, %rax
; AVX512-NEXT:    vpbroadcastq %rax, %ymm1
; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
; AVX512-NEXT:    vpbroadcastq (%rdi), %ymm0 {%k1}
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: load_i64_v4i64:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    pushl %ebp
; X86AVX2-NEXT:    movl %esp, %ebp
; X86AVX2-NEXT:    pushl %esi
; X86AVX2-NEXT:    andl $-32, %esp
; X86AVX2-NEXT:    subl $96, %esp
; X86AVX2-NEXT:    movl 12(%ebp), %eax
; X86AVX2-NEXT:    movl 8(%ebp), %ecx
; X86AVX2-NEXT:    movl (%ecx), %edx
; X86AVX2-NEXT:    movl 4(%ecx), %ecx
; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
; X86AVX2-NEXT:    leal (%eax,%eax), %esi
; X86AVX2-NEXT:    andl $7, %esi
; X86AVX2-NEXT:    movl %edx, (%esp,%esi,4)
; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
; X86AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
; X86AVX2-NEXT:    leal 1(%eax,%eax), %eax
; X86AVX2-NEXT:    andl $7, %eax
; X86AVX2-NEXT:    movl %ecx, 32(%esp,%eax,4)
; X86AVX2-NEXT:    vmovaps {{[0-9]+}}(%esp), %ymm0
; X86AVX2-NEXT:    leal -4(%ebp), %esp
; X86AVX2-NEXT:    popl %esi
; X86AVX2-NEXT:    popl %ebp
; X86AVX2-NEXT:    retl
  %x = load i64, i64* %p
  %ins = insertelement <4 x i64> %v, i64 %x, i32 %y
  ret <4 x i64> %ins
}
2152
; Insert a loaded float into a <8 x float> vector at a variable index.  The
; AVX1/AVX2/X86AVX2 lowerings fold the load into a vbroadcastss and blend under
; a lane-index compare mask; AVX512 uses a masked vbroadcastss from memory;
; SSE uses the stack round-trip.
define <8 x float> @load_f32_v8f32(<8 x float> %v, float* %p, i32 %y) nounwind {
; SSE-LABEL: load_f32_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $7, %esi
; SSE-NEXT:    movss %xmm2, -40(%rsp,%rsi,4)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_f32_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vbroadcastss (%rdi), %ymm2
; AVX1-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_f32_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss (%rdi), %ymm1
; AVX2-NEXT:    vmovd %esi, %xmm2
; AVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
; AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: load_f32_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd %esi, %ymm1
; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0 {%k1}
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: load_f32_v8f32:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm1
; X86AVX2-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
; X86AVX2-NEXT:    vbroadcastss (%eax), %ymm2
; X86AVX2-NEXT:    vblendvps %ymm1, %ymm2, %ymm0, %ymm0
; X86AVX2-NEXT:    retl
  %x = load float, float* %p
  %ins = insertelement <8 x float> %v, float %x, i32 %y
  ret <8 x float> %ins
}
2205
; Insert a loaded double into a <4 x double> vector at a variable index.
; AVX1/AVX2 fold the load into a vbroadcastsd and vblendvpd under a qword
; compare mask; AVX512 uses a masked vbroadcastsd from memory; SSE and the
; 32-bit X86AVX2 target use the stack round-trip.
define <4 x double> @load_f64_v4f64(<4 x double> %v, double* %p, i32 %y) nounwind {
; SSE-LABEL: load_f64_v4f64:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $3, %esi
; SSE-NEXT:    movsd %xmm2, -40(%rsp,%rsi,8)
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: load_f64_v4f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movslq %esi, %rax
; AVX1-NEXT:    vmovq %rax, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm2
; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: load_f64_v4f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
; AVX2-NEXT:    movslq %esi, %rax
; AVX2-NEXT:    vmovq %rax, %xmm2
; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
; AVX2-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: load_f64_v4f64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movslq %esi, %rax
; AVX512-NEXT:    vpbroadcastq %rax, %ymm1
; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0 {%k1}
; AVX512-NEXT:    retq
;
; X86AVX2-LABEL: load_f64_v4f64:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    pushl %ebp
; X86AVX2-NEXT:    movl %esp, %ebp
; X86AVX2-NEXT:    andl $-32, %esp
; X86AVX2-NEXT:    subl $64, %esp
; X86AVX2-NEXT:    movl 12(%ebp), %eax
; X86AVX2-NEXT:    andl $3, %eax
; X86AVX2-NEXT:    movl 8(%ebp), %ecx
; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86AVX2-NEXT:    vmovaps %ymm0, (%esp)
; X86AVX2-NEXT:    vmovsd %xmm1, (%esp,%eax,8)
; X86AVX2-NEXT:    vmovaps (%esp), %ymm0
; X86AVX2-NEXT:    movl %ebp, %esp
; X86AVX2-NEXT:    popl %ebp
; X86AVX2-NEXT:    retl
  %x = load double, double* %p
  %ins = insertelement <4 x double> %v, double %x, i32 %y
  ret <4 x double> %ins
}
2269
; Don't crash when the insertelement index has an unusual/invalid type (PR44139).
2271
; Reduced from PR44139: an insertelement whose index operand is an i1 (from a
; constant-true icmp) must not crash instruction selection.  The checked
; output contains only the scalar srem/udiv computation; the vector store does
; not appear in the assertions — presumably eliminated, verify if this changes.
define i32 @PR44139(<16 x i64>* %p) {
; SSE-LABEL: PR44139:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rdi), %eax
; SSE-NEXT:    leal 2147483647(%rax), %ecx
; SSE-NEXT:    testl %eax, %eax
; SSE-NEXT:    cmovnsl %eax, %ecx
; SSE-NEXT:    andl $-2147483648, %ecx # imm = 0x80000000
; SSE-NEXT:    addl %eax, %ecx
; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    divl %ecx
; SSE-NEXT:    retq
;
; AVX-LABEL: PR44139:
; AVX:       # %bb.0:
; AVX-NEXT:    movl (%rdi), %eax
; AVX-NEXT:    leal 2147483647(%rax), %ecx
; AVX-NEXT:    testl %eax, %eax
; AVX-NEXT:    cmovnsl %eax, %ecx
; AVX-NEXT:    andl $-2147483648, %ecx # imm = 0x80000000
; AVX-NEXT:    addl %eax, %ecx
; AVX-NEXT:    # kill: def $eax killed $eax killed $rax
; AVX-NEXT:    xorl %edx, %edx
; AVX-NEXT:    divl %ecx
; AVX-NEXT:    retq
;
; X86AVX2-LABEL: PR44139:
; X86AVX2:       # %bb.0:
; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86AVX2-NEXT:    movl (%eax), %eax
; X86AVX2-NEXT:    leal 2147483647(%eax), %ecx
; X86AVX2-NEXT:    testl %eax, %eax
; X86AVX2-NEXT:    cmovnsl %eax, %ecx
; X86AVX2-NEXT:    andl $-2147483648, %ecx # imm = 0x80000000
; X86AVX2-NEXT:    addl %eax, %ecx
; X86AVX2-NEXT:    xorl %edx, %edx
; X86AVX2-NEXT:    divl %ecx
; X86AVX2-NEXT:    retl
  %L = load <16 x i64>, <16 x i64>* %p
  %E1 = extractelement <16 x i64> %L, i64 0
  %tempvector = insertelement <16 x i64> undef, i64 %E1, i32 0
  %vector = shufflevector <16 x i64> %tempvector, <16 x i64> undef, <16 x i32> zeroinitializer
  %C3 = icmp sgt i64 9223372036854775807, -9223372036854775808 ; constant-folds to true
  %t0 = trunc <16 x i64> %vector to <16 x i32>
  %I4 = insertelement <16 x i64> %vector, i64 %E1, i1 %C3 ; i1-typed index is the regression trigger
  store <16 x i64> %I4, <16 x i64>* %p
  %elt = extractelement <16 x i32> %t0, i32 0
  %B = srem i32 %elt, -2147483648 ; srem by INT32_MIN -> the lea/cmovns/and/add sequence above
  %B9 = udiv i32 %elt, %B
  ret i32 %B9
}
2324