1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X86
3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X64
4
5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c
6
7define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
8; CHECK-LABEL: test_mm256_add_pd:
9; CHECK:       # %bb.0:
10; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
11; CHECK-NEXT:    ret{{[l|q]}}
12  %res = fadd <4 x double> %a0, %a1
13  ret <4 x double> %res
14}
15
16define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
17; CHECK-LABEL: test_mm256_add_ps:
18; CHECK:       # %bb.0:
19; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
20; CHECK-NEXT:    ret{{[l|q]}}
21  %res = fadd <8 x float> %a0, %a1
22  ret <8 x float> %res
23}
24
25define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
26; CHECK-LABEL: test_mm256_addsub_pd:
27; CHECK:       # %bb.0:
28; CHECK-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
29; CHECK-NEXT:    ret{{[l|q]}}
30  %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
31  ret <4 x double> %res
32}
33declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
34
35define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
36; CHECK-LABEL: test_mm256_addsub_ps:
37; CHECK:       # %bb.0:
38; CHECK-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
39; CHECK-NEXT:    ret{{[l|q]}}
40  %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
41  ret <8 x float> %res
42}
43declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
44
45define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
46; CHECK-LABEL: test_mm256_and_pd:
47; CHECK:       # %bb.0:
48; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
49; CHECK-NEXT:    ret{{[l|q]}}
50  %1 = bitcast <4 x double> %a0 to <4 x i64>
51  %2 = bitcast <4 x double> %a1 to <4 x i64>
52  %res = and <4 x i64> %1, %2
53  %bc = bitcast <4 x i64> %res to <4 x double>
54  ret <4 x double> %bc
55}
56
57define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
58; CHECK-LABEL: test_mm256_and_ps:
59; CHECK:       # %bb.0:
60; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
61; CHECK-NEXT:    ret{{[l|q]}}
62  %1 = bitcast <8 x float> %a0 to <8 x i32>
63  %2 = bitcast <8 x float> %a1 to <8 x i32>
64  %res = and <8 x i32> %1, %2
65  %bc = bitcast <8 x i32> %res to <8 x float>
66  ret <8 x float> %bc
67}
68
69define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
70; CHECK-LABEL: test_mm256_andnot_pd:
71; CHECK:       # %bb.0:
72; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
73; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
74; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
75; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
76; CHECK-NEXT:    ret{{[l|q]}}
77  %1 = bitcast <4 x double> %a0 to <4 x i64>
78  %2 = bitcast <4 x double> %a1 to <4 x i64>
79  %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
80  %res = and <4 x i64> %3, %2
81  %bc = bitcast <4 x i64> %res to <4 x double>
82  ret <4 x double> %bc
83}
84
85define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
86; CHECK-LABEL: test_mm256_andnot_ps:
87; CHECK:       # %bb.0:
88; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
89; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
90; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
91; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
92; CHECK-NEXT:    ret{{[l|q]}}
93  %1 = bitcast <8 x float> %a0 to <8 x i32>
94  %2 = bitcast <8 x float> %a1 to <8 x i32>
95  %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
96  %res = and <8 x i32> %3, %2
97  %bc = bitcast <8 x i32> %res to <8 x float>
98  ret <8 x float> %bc
99}
100
101define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
102; CHECK-LABEL: test_mm256_blend_pd:
103; CHECK:       # %bb.0:
104; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
105; CHECK-NEXT:    ret{{[l|q]}}
106  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
107  ret <4 x double> %res
108}
109
110define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
111; CHECK-LABEL: test_mm256_blend_ps:
112; CHECK:       # %bb.0:
113; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
114; CHECK-NEXT:    ret{{[l|q]}}
115  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
116  ret <8 x float> %res
117}
118
119define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
120; CHECK-LABEL: test_mm256_blendv_pd:
121; CHECK:       # %bb.0:
122; CHECK-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
123; CHECK-NEXT:    ret{{[l|q]}}
124  %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
125  ret <4 x double> %res
126}
127declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
128
129define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
130; CHECK-LABEL: test_mm256_blendv_ps:
131; CHECK:       # %bb.0:
132; CHECK-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
133; CHECK-NEXT:    ret{{[l|q]}}
134  %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
135  ret <8 x float> %res
136}
137declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
138
139define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind {
140; X86-LABEL: test_mm256_broadcast_pd:
141; X86:       # %bb.0:
142; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
143; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
144; X86-NEXT:    retl
145;
146; X64-LABEL: test_mm256_broadcast_pd:
147; X64:       # %bb.0:
148; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
149; X64-NEXT:    retq
150  %ld = load <2 x double>, <2 x double>* %a0
151  %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
152  ret <4 x double> %res
153}
154
155define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind {
156; X86-LABEL: test_mm256_broadcast_ps:
157; X86:       # %bb.0:
158; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
159; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
160; X86-NEXT:    retl
161;
162; X64-LABEL: test_mm256_broadcast_ps:
163; X64:       # %bb.0:
164; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
165; X64-NEXT:    retq
166  %ld = load <4 x float>, <4 x float>* %a0
167  %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
168  ret <8 x float> %res
169}
170
171define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind {
172; X86-LABEL: test_mm256_broadcast_sd:
173; X86:       # %bb.0:
174; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
175; X86-NEXT:    vbroadcastsd (%eax), %ymm0
176; X86-NEXT:    retl
177;
178; X64-LABEL: test_mm256_broadcast_sd:
179; X64:       # %bb.0:
180; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
181; X64-NEXT:    retq
182  %ld = load double, double* %a0
183  %ins0 = insertelement <4 x double> undef, double %ld, i32 0
184  %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
185  %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
186  %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
187  ret <4 x double> %ins3
188}
189
190define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind {
191; X86-LABEL: test_mm_broadcast_ss:
192; X86:       # %bb.0:
193; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
194; X86-NEXT:    vbroadcastss (%eax), %xmm0
195; X86-NEXT:    retl
196;
197; X64-LABEL: test_mm_broadcast_ss:
198; X64:       # %bb.0:
199; X64-NEXT:    vbroadcastss (%rdi), %xmm0
200; X64-NEXT:    retq
201  %ld = load float, float* %a0
202  %ins0 = insertelement <4 x float> undef, float %ld, i32 0
203  %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
204  %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
205  %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
206  ret <4 x float> %ins3
207}
208
209define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind {
210; X86-LABEL: test_mm256_broadcast_ss:
211; X86:       # %bb.0:
212; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
213; X86-NEXT:    vbroadcastss (%eax), %ymm0
214; X86-NEXT:    retl
215;
216; X64-LABEL: test_mm256_broadcast_ss:
217; X64:       # %bb.0:
218; X64-NEXT:    vbroadcastss (%rdi), %ymm0
219; X64-NEXT:    retq
220  %ld = load float, float* %a0
221  %ins0 = insertelement <8 x float> undef, float %ld, i32 0
222  %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
223  %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
224  %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
225  %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
226  %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
227  %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
228  %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
229  ret <8 x float> %ins7
230}
231
232define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
233; CHECK-LABEL: test_mm256_castpd_ps:
234; CHECK:       # %bb.0:
235; CHECK-NEXT:    ret{{[l|q]}}
236  %res = bitcast <4 x double> %a0 to <8 x float>
237  ret <8 x float> %res
238}
239
240define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
241; CHECK-LABEL: test_mm256_castpd_si256:
242; CHECK:       # %bb.0:
243; CHECK-NEXT:    ret{{[l|q]}}
244  %res = bitcast <4 x double> %a0 to <4 x i64>
245  ret <4 x i64> %res
246}
247
248define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
249; CHECK-LABEL: test_mm256_castpd128_pd256:
250; CHECK:       # %bb.0:
251; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
252; CHECK-NEXT:    ret{{[l|q]}}
253  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
254  ret <4 x double> %res
255}
256
257define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
258; CHECK-LABEL: test_mm256_castpd256_pd128:
259; CHECK:       # %bb.0:
260; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
261; CHECK-NEXT:    vzeroupper
262; CHECK-NEXT:    ret{{[l|q]}}
263  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
264  ret <2 x double> %res
265}
266
267define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
268; CHECK-LABEL: test_mm256_castps_pd:
269; CHECK:       # %bb.0:
270; CHECK-NEXT:    ret{{[l|q]}}
271  %res = bitcast <8 x float> %a0 to <4 x double>
272  ret <4 x double> %res
273}
274
275define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
276; CHECK-LABEL: test_mm256_castps_si256:
277; CHECK:       # %bb.0:
278; CHECK-NEXT:    ret{{[l|q]}}
279  %res = bitcast <8 x float> %a0 to <4 x i64>
280  ret <4 x i64> %res
281}
282
283define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
284; CHECK-LABEL: test_mm256_castps128_ps256:
285; CHECK:       # %bb.0:
286; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
287; CHECK-NEXT:    ret{{[l|q]}}
288  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
289  ret <8 x float> %res
290}
291
292define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
293; CHECK-LABEL: test_mm256_castps256_ps128:
294; CHECK:       # %bb.0:
295; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
296; CHECK-NEXT:    vzeroupper
297; CHECK-NEXT:    ret{{[l|q]}}
298  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
299  ret <4 x float> %res
300}
301
302define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
303; CHECK-LABEL: test_mm256_castsi128_si256:
304; CHECK:       # %bb.0:
305; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
306; CHECK-NEXT:    ret{{[l|q]}}
307  %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
308  ret <4 x i64> %res
309}
310
311define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
312; CHECK-LABEL: test_mm256_castsi256_pd:
313; CHECK:       # %bb.0:
314; CHECK-NEXT:    ret{{[l|q]}}
315  %res = bitcast <4 x i64> %a0 to <4 x double>
316  ret <4 x double> %res
317}
318
319define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
320; CHECK-LABEL: test_mm256_castsi256_ps:
321; CHECK:       # %bb.0:
322; CHECK-NEXT:    ret{{[l|q]}}
323  %res = bitcast <4 x i64> %a0 to <8 x float>
324  ret <8 x float> %res
325}
326
327define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
328; CHECK-LABEL: test_mm256_castsi256_si128:
329; CHECK:       # %bb.0:
330; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
331; CHECK-NEXT:    vzeroupper
332; CHECK-NEXT:    ret{{[l|q]}}
333  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
334  ret <2 x i64> %res
335}
336
337define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
338; CHECK-LABEL: test_mm256_ceil_pd:
339; CHECK:       # %bb.0:
340; CHECK-NEXT:    vroundpd $2, %ymm0, %ymm0
341; CHECK-NEXT:    ret{{[l|q]}}
342  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
343  ret <4 x double> %res
344}
345declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
346
347define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
348; CHECK-LABEL: test_mm256_ceil_ps:
349; CHECK:       # %bb.0:
350; CHECK-NEXT:    vroundps $2, %ymm0, %ymm0
351; CHECK-NEXT:    ret{{[l|q]}}
352  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
353  ret <8 x float> %res
354}
355declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
356
357define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
358; CHECK-LABEL: test_mm_cmp_pd:
359; CHECK:       # %bb.0:
360; CHECK-NEXT:    vcmpgepd %xmm1, %xmm0, %xmm0
361; CHECK-NEXT:    ret{{[l|q]}}
362  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
363  ret <2 x double> %res
364}
365declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
366
367define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
368; CHECK-LABEL: test_mm256_cmp_pd:
369; CHECK:       # %bb.0:
370; CHECK-NEXT:    vcmpgepd %ymm1, %ymm0, %ymm0
371; CHECK-NEXT:    ret{{[l|q]}}
372  %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
373  ret <4 x double> %res
374}
375declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
376
377define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
378; CHECK-LABEL: test_mm_cmp_ps:
379; CHECK:       # %bb.0:
380; CHECK-NEXT:    vcmpgeps %xmm1, %xmm0, %xmm0
381; CHECK-NEXT:    ret{{[l|q]}}
382  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
383  ret <4 x float> %res
384}
385declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
386
387define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
388; CHECK-LABEL: test_mm256_cmp_ps:
389; CHECK:       # %bb.0:
390; CHECK-NEXT:    vcmpgeps %ymm1, %ymm0, %ymm0
391; CHECK-NEXT:    ret{{[l|q]}}
392  %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
393  ret <8 x float> %res
394}
395declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
396
397define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
398; CHECK-LABEL: test_mm_cmp_sd:
399; CHECK:       # %bb.0:
400; CHECK-NEXT:    vcmpgesd %xmm1, %xmm0, %xmm0
401; CHECK-NEXT:    ret{{[l|q]}}
402  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
403  ret <2 x double> %res
404}
405declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
406
407define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
408; CHECK-LABEL: test_mm_cmp_ss:
409; CHECK:       # %bb.0:
410; CHECK-NEXT:    vcmpgess %xmm1, %xmm0, %xmm0
411; CHECK-NEXT:    ret{{[l|q]}}
412  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
413  ret <4 x float> %res
414}
415declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
416
417define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
418; CHECK-LABEL: test_mm256_cvtepi32_pd:
419; CHECK:       # %bb.0:
420; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
421; CHECK-NEXT:    ret{{[l|q]}}
422  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
423  %res = sitofp <4 x i32> %arg0 to <4 x double>
424  ret <4 x double> %res
425}
426
427define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
428; CHECK-LABEL: test_mm256_cvtepi32_ps:
429; CHECK:       # %bb.0:
430; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
431; CHECK-NEXT:    ret{{[l|q]}}
432  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
433  %res = sitofp <8 x i32> %arg0 to <8 x float>
434  ret <8 x float> %res
435}
436
437define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
438; CHECK-LABEL: test_mm256_cvtpd_epi32:
439; CHECK:       # %bb.0:
440; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm0
441; CHECK-NEXT:    vzeroupper
442; CHECK-NEXT:    ret{{[l|q]}}
443  %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
444  %res = bitcast <4 x i32> %cvt to <2 x i64>
445  ret <2 x i64> %res
446}
447declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
448
449define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
450; CHECK-LABEL: test_mm256_cvtpd_ps:
451; CHECK:       # %bb.0:
452; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0
453; CHECK-NEXT:    vzeroupper
454; CHECK-NEXT:    ret{{[l|q]}}
455  %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
456  ret <4 x float> %res
457}
458declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
459
460define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
461; CHECK-LABEL: test_mm256_cvtps_epi32:
462; CHECK:       # %bb.0:
463; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm0
464; CHECK-NEXT:    ret{{[l|q]}}
465  %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
466  %res = bitcast <8 x i32> %cvt to <4 x i64>
467  ret <4 x i64> %res
468}
469declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
470
471define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
472; CHECK-LABEL: test_mm256_cvtps_pd:
473; CHECK:       # %bb.0:
474; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
475; CHECK-NEXT:    ret{{[l|q]}}
476  %res = fpext <4 x float> %a0 to <4 x double>
477  ret <4 x double> %res
478}
479
480define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
481; CHECK-LABEL: test_mm256_cvttpd_epi32:
482; CHECK:       # %bb.0:
483; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
484; CHECK-NEXT:    vzeroupper
485; CHECK-NEXT:    ret{{[l|q]}}
486  %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
487  %res = bitcast <4 x i32> %cvt to <2 x i64>
488  ret <2 x i64> %res
489}
490declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
491
492define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
493; CHECK-LABEL: test_mm256_cvttps_epi32:
494; CHECK:       # %bb.0:
495; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
496; CHECK-NEXT:    ret{{[l|q]}}
497  %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
498  %res = bitcast <8 x i32> %cvt to <4 x i64>
499  ret <4 x i64> %res
500}
501declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
502
503define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
504; CHECK-LABEL: test_mm256_div_pd:
505; CHECK:       # %bb.0:
506; CHECK-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
507; CHECK-NEXT:    ret{{[l|q]}}
508  %res = fdiv <4 x double> %a0, %a1
509  ret <4 x double> %res
510}
511
512define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
513; CHECK-LABEL: test_mm256_div_ps:
514; CHECK:       # %bb.0:
515; CHECK-NEXT:    vdivps %ymm1, %ymm0, %ymm0
516; CHECK-NEXT:    ret{{[l|q]}}
517  %res = fdiv <8 x float> %a0, %a1
518  ret <8 x float> %res
519}
520
521define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
522; CHECK-LABEL: test_mm256_dp_ps:
523; CHECK:       # %bb.0:
524; CHECK-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
525; CHECK-NEXT:    ret{{[l|q]}}
526  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
527  ret <8 x float> %res
528}
529declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
530
531define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
532; CHECK-LABEL: test_mm256_extract_epi8:
533; CHECK:       # %bb.0:
534; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
535; CHECK-NEXT:    vpextrb $15, %xmm0, %eax
536; CHECK-NEXT:    movzbl %al, %eax
537; CHECK-NEXT:    vzeroupper
538; CHECK-NEXT:    ret{{[l|q]}}
539  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
540  %ext = extractelement <32 x i8> %arg0, i32 31
541  %res = zext i8 %ext to i32
542  ret i32 %res
543}
544
545define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
546; CHECK-LABEL: test_mm256_extract_epi16:
547; CHECK:       # %bb.0:
548; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
549; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
550; CHECK-NEXT:    movzwl %ax, %eax
551; CHECK-NEXT:    vzeroupper
552; CHECK-NEXT:    ret{{[l|q]}}
553  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
554  %ext = extractelement <16 x i16> %arg0, i32 11
555  %res = zext i16 %ext to i32
556  ret i32 %res
557}
558
559define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
560; CHECK-LABEL: test_mm256_extract_epi32:
561; CHECK:       # %bb.0:
562; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
563; CHECK-NEXT:    vextractps $1, %xmm0, %eax
564; CHECK-NEXT:    vzeroupper
565; CHECK-NEXT:    ret{{[l|q]}}
566  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
567  %res = extractelement <8 x i32> %arg0, i32 5
568  ret i32 %res
569}
570
571define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
572; X86-LABEL: test_mm256_extract_epi64:
573; X86:       # %bb.0:
574; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
575; X86-NEXT:    vextractps $2, %xmm0, %eax
576; X86-NEXT:    vextractps $3, %xmm0, %edx
577; X86-NEXT:    vzeroupper
578; X86-NEXT:    retl
579;
580; X64-LABEL: test_mm256_extract_epi64:
581; X64:       # %bb.0:
582; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
583; X64-NEXT:    vpextrq $1, %xmm0, %rax
584; X64-NEXT:    vzeroupper
585; X64-NEXT:    retq
586  %res = extractelement <4 x i64> %a0, i32 3
587  ret i64 %res
588}
589
590define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
591; CHECK-LABEL: test_mm256_extractf128_pd:
592; CHECK:       # %bb.0:
593; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
594; CHECK-NEXT:    vzeroupper
595; CHECK-NEXT:    ret{{[l|q]}}
596  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3>
597  ret <2 x double> %res
598}
599
600define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
601; CHECK-LABEL: test_mm256_extractf128_ps:
602; CHECK:       # %bb.0:
603; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
604; CHECK-NEXT:    vzeroupper
605; CHECK-NEXT:    ret{{[l|q]}}
606  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
607  ret <4 x float> %res
608}
609
610define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
611; CHECK-LABEL: test_mm256_extractf128_si256:
612; CHECK:       # %bb.0:
613; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
614; CHECK-NEXT:    vzeroupper
615; CHECK-NEXT:    ret{{[l|q]}}
616  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
617  ret <2 x i64> %res
618}
619
620define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
621; CHECK-LABEL: test_mm256_floor_pd:
622; CHECK:       # %bb.0:
623; CHECK-NEXT:    vroundpd $1, %ymm0, %ymm0
624; CHECK-NEXT:    ret{{[l|q]}}
625  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
626  ret <4 x double> %res
627}
628
629define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
630; CHECK-LABEL: test_mm256_floor_ps:
631; CHECK:       # %bb.0:
632; CHECK-NEXT:    vroundps $1, %ymm0, %ymm0
633; CHECK-NEXT:    ret{{[l|q]}}
634  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
635  ret <8 x float> %res
636}
637
638define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
639; CHECK-LABEL: test_mm256_hadd_pd:
640; CHECK:       # %bb.0:
641; CHECK-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
642; CHECK-NEXT:    ret{{[l|q]}}
643  %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
644  ret <4 x double> %res
645}
646declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
647
648define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
649; CHECK-LABEL: test_mm256_hadd_ps:
650; CHECK:       # %bb.0:
651; CHECK-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
652; CHECK-NEXT:    ret{{[l|q]}}
653  %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
654  ret <8 x float> %res
655}
656declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
657
658define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
659; CHECK-LABEL: test_mm256_hsub_pd:
660; CHECK:       # %bb.0:
661; CHECK-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
662; CHECK-NEXT:    ret{{[l|q]}}
663  %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
664  ret <4 x double> %res
665}
666declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
667
668define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
669; CHECK-LABEL: test_mm256_hsub_ps:
670; CHECK:       # %bb.0:
671; CHECK-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
672; CHECK-NEXT:    ret{{[l|q]}}
673  %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
674  ret <8 x float> %res
675}
676declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
677
678define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
679; X86-LABEL: test_mm256_insert_epi8:
680; X86:       # %bb.0:
681; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
682; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
683; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
684; X86-NEXT:    retl
685;
686; X64-LABEL: test_mm256_insert_epi8:
687; X64:       # %bb.0:
688; X64-NEXT:    movzbl %dil, %eax
689; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
690; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
691; X64-NEXT:    retq
692  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
693  %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4
694  %bc = bitcast <32 x i8> %res to <4 x i64>
695  ret <4 x i64> %bc
696}
697
698define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
699; X86-LABEL: test_mm256_insert_epi16:
700; X86:       # %bb.0:
701; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
702; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
703; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
704; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
705; X86-NEXT:    retl
706;
707; X64-LABEL: test_mm256_insert_epi16:
708; X64:       # %bb.0:
709; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
710; X64-NEXT:    vpinsrw $6, %edi, %xmm1, %xmm1
711; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
712; X64-NEXT:    retq
713  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
714  %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14
715  %bc = bitcast <16 x i16> %res to <4 x i64>
716  ret <4 x i64> %bc
717}
718
719define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
720; X86-LABEL: test_mm256_insert_epi32:
721; X86:       # %bb.0:
722; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
723; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
724; X86-NEXT:    retl
725;
726; X64-LABEL: test_mm256_insert_epi32:
727; X64:       # %bb.0:
728; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm1
729; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
730; X64-NEXT:    retq
731  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
732  %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3
733  %bc = bitcast <8 x i32> %res to <4 x i64>
734  ret <4 x i64> %bc
735}
736
737define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
738; X86-LABEL: test_mm256_insert_epi64:
739; X86:       # %bb.0:
740; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
741; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
742; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
743; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
744; X86-NEXT:    retl
745;
746; X64-LABEL: test_mm256_insert_epi64:
747; X64:       # %bb.0:
748; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
749; X64-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
750; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
751; X64-NEXT:    retq
752  %res = insertelement <4 x i64> %a0, i64 %a1, i32 3
753  ret <4 x i64> %res
754}
755
756define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
757; CHECK-LABEL: test_mm256_insertf128_pd:
758; CHECK:       # %bb.0:
759; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
760; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
761; CHECK-NEXT:    ret{{[l|q]}}
762  %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
763  %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
764  ret <4 x double> %res
765}
766
767define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
768; CHECK-LABEL: test_mm256_insertf128_ps:
769; CHECK:       # %bb.0:
770; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
771; CHECK-NEXT:    ret{{[l|q]}}
772  %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
773  %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
774  ret <8 x float> %res
775}
776
777define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
778; CHECK-LABEL: test_mm256_insertf128_si256:
779; CHECK:       # %bb.0:
780; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
781; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
782; CHECK-NEXT:    ret{{[l|q]}}
783  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
784  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
785  ret <4 x i64> %res
786}
787
788define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind {
789; X86-LABEL: test_mm256_lddqu_si256:
790; X86:       # %bb.0:
791; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
792; X86-NEXT:    vlddqu (%eax), %ymm0
793; X86-NEXT:    retl
794;
795; X64-LABEL: test_mm256_lddqu_si256:
796; X64:       # %bb.0:
797; X64-NEXT:    vlddqu (%rdi), %ymm0
798; X64-NEXT:    retq
799  %arg0 = bitcast <4 x i64>* %a0 to i8*
800  %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0)
801  %bc = bitcast <32 x i8> %res to <4 x i64>
802  ret <4 x i64> %bc
803}
804declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readnone
805
806define <4 x double> @test_mm256_load_pd(double* %a0) nounwind {
807; X86-LABEL: test_mm256_load_pd:
808; X86:       # %bb.0:
809; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
810; X86-NEXT:    vmovaps (%eax), %ymm0
811; X86-NEXT:    retl
812;
813; X64-LABEL: test_mm256_load_pd:
814; X64:       # %bb.0:
815; X64-NEXT:    vmovaps (%rdi), %ymm0
816; X64-NEXT:    retq
817  %arg0 = bitcast double* %a0 to <4 x double>*
818  %res = load <4 x double>, <4 x double>* %arg0, align 32
819  ret <4 x double> %res
820}
821
822define <8 x float> @test_mm256_load_ps(float* %a0) nounwind {
823; X86-LABEL: test_mm256_load_ps:
824; X86:       # %bb.0:
825; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
826; X86-NEXT:    vmovaps (%eax), %ymm0
827; X86-NEXT:    retl
828;
829; X64-LABEL: test_mm256_load_ps:
830; X64:       # %bb.0:
831; X64-NEXT:    vmovaps (%rdi), %ymm0
832; X64-NEXT:    retq
833  %arg0 = bitcast float* %a0 to <8 x float>*
834  %res = load <8 x float>, <8 x float>* %arg0, align 32
835  ret <8 x float> %res
836}
837
838define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind {
839; X86-LABEL: test_mm256_load_si256:
840; X86:       # %bb.0:
841; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
842; X86-NEXT:    vmovaps (%eax), %ymm0
843; X86-NEXT:    retl
844;
845; X64-LABEL: test_mm256_load_si256:
846; X64:       # %bb.0:
847; X64-NEXT:    vmovaps (%rdi), %ymm0
848; X64-NEXT:    retq
849  %res = load <4 x i64>, <4 x i64>* %a0, align 32
850  ret <4 x i64> %res
851}
852
853define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind {
854; X86-LABEL: test_mm256_loadu_pd:
855; X86:       # %bb.0:
856; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
857; X86-NEXT:    vmovups (%eax), %ymm0
858; X86-NEXT:    retl
859;
860; X64-LABEL: test_mm256_loadu_pd:
861; X64:       # %bb.0:
862; X64-NEXT:    vmovups (%rdi), %ymm0
863; X64-NEXT:    retq
864  %arg0 = bitcast double* %a0 to <4 x double>*
865  %res = load <4 x double>, <4 x double>* %arg0, align 1
866  ret <4 x double> %res
867}
868
869define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind {
870; X86-LABEL: test_mm256_loadu_ps:
871; X86:       # %bb.0:
872; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
873; X86-NEXT:    vmovups (%eax), %ymm0
874; X86-NEXT:    retl
875;
876; X64-LABEL: test_mm256_loadu_ps:
877; X64:       # %bb.0:
878; X64-NEXT:    vmovups (%rdi), %ymm0
879; X64-NEXT:    retq
880  %arg0 = bitcast float* %a0 to <8 x float>*
881  %res = load <8 x float>, <8 x float>* %arg0, align 1
882  ret <8 x float> %res
883}
884
885define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind {
886; X86-LABEL: test_mm256_loadu_si256:
887; X86:       # %bb.0:
888; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
889; X86-NEXT:    vmovups (%eax), %ymm0
890; X86-NEXT:    retl
891;
892; X64-LABEL: test_mm256_loadu_si256:
893; X64:       # %bb.0:
894; X64-NEXT:    vmovups (%rdi), %ymm0
895; X64-NEXT:    retq
896  %res = load <4 x i64>, <4 x i64>* %a0, align 1
897  ret <4 x i64> %res
898}
899
900define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
901; X86-LABEL: test_mm256_loadu2_m128:
902; X86:       # %bb.0:
903; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
904; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
905; X86-NEXT:    vmovups (%eax), %xmm0
906; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
907; X86-NEXT:    retl
908;
909; X64-LABEL: test_mm256_loadu2_m128:
910; X64:       # %bb.0:
911; X64-NEXT:    vmovups (%rsi), %xmm0
912; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
913; X64-NEXT:    retq
914  %arg0 = bitcast float* %a0 to <4 x float>*
915  %hi4 = load <4 x float>, <4 x float>* %arg0, align 1
916  %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
917  %arg1 = bitcast float* %a1 to <4 x float>*
918  %lo4 = load <4 x float>, <4 x float>* %arg1, align 1
919  %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
920  %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
921  ret <8 x float> %res
922}
923
924define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind {
925; X86-LABEL: test_mm256_loadu2_m128d:
926; X86:       # %bb.0:
927; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
928; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
929; X86-NEXT:    vmovups (%eax), %xmm0
930; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
931; X86-NEXT:    retl
932;
933; X64-LABEL: test_mm256_loadu2_m128d:
934; X64:       # %bb.0:
935; X64-NEXT:    vmovups (%rsi), %xmm0
936; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
937; X64-NEXT:    retq
938  %arg0 = bitcast double* %a0 to <2 x double>*
939  %hi2 = load <2 x double>, <2 x double>* %arg0, align 1
940  %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
941  %arg1 = bitcast double* %a1 to <2 x double>*
942  %lo2 = load <2 x double>, <2 x double>* %arg1, align 1
943  %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
944  %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
945  ret <4 x double> %res
946}
947
948define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
949; X86-LABEL: test_mm256_loadu2_m128i:
950; X86:       # %bb.0:
951; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
952; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
953; X86-NEXT:    vmovups (%eax), %xmm0
954; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
955; X86-NEXT:    retl
956;
957; X64-LABEL: test_mm256_loadu2_m128i:
958; X64:       # %bb.0:
959; X64-NEXT:    vmovups (%rsi), %xmm0
960; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
961; X64-NEXT:    retq
962  %arg0 = bitcast i64* %a0 to <2 x i64>*
963  %hi2 = load <2 x i64>, <2 x i64>* %arg0, align 1
964  %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
965  %arg1 = bitcast i64* %a1 to <2 x i64>*
966  %lo2 = load <2 x i64>, <2 x i64>* %arg1, align 1
967  %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
968  %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
969  ret <4 x i64> %res
970}
971
972define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind {
973; X86-LABEL: test_mm_maskload_pd:
974; X86:       # %bb.0:
975; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
976; X86-NEXT:    vmaskmovpd (%eax), %xmm0, %xmm0
977; X86-NEXT:    retl
978;
979; X64-LABEL: test_mm_maskload_pd:
980; X64:       # %bb.0:
981; X64-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm0
982; X64-NEXT:    retq
983  %arg0 = bitcast double* %a0 to i8*
984  %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1)
985  ret <2 x double> %res
986}
987declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readnone
988
989define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind {
990; X86-LABEL: test_mm256_maskload_pd:
991; X86:       # %bb.0:
992; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
993; X86-NEXT:    vmaskmovpd (%eax), %ymm0, %ymm0
994; X86-NEXT:    retl
995;
996; X64-LABEL: test_mm256_maskload_pd:
997; X64:       # %bb.0:
998; X64-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
999; X64-NEXT:    retq
1000  %arg0 = bitcast double* %a0 to i8*
1001  %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1)
1002  ret <4 x double> %res
1003}
1004declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readnone
1005
1006define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind {
1007; X86-LABEL: test_mm_maskload_ps:
1008; X86:       # %bb.0:
1009; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1010; X86-NEXT:    vmaskmovps (%eax), %xmm0, %xmm0
1011; X86-NEXT:    retl
1012;
1013; X64-LABEL: test_mm_maskload_ps:
1014; X64:       # %bb.0:
1015; X64-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
1016; X64-NEXT:    retq
1017  %arg0 = bitcast float* %a0 to i8*
1018  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1019  %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1)
1020  ret <4 x float> %res
1021}
1022declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readnone
1023
1024define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind {
1025; X86-LABEL: test_mm256_maskload_ps:
1026; X86:       # %bb.0:
1027; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1028; X86-NEXT:    vmaskmovps (%eax), %ymm0, %ymm0
1029; X86-NEXT:    retl
1030;
1031; X64-LABEL: test_mm256_maskload_ps:
1032; X64:       # %bb.0:
1033; X64-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
1034; X64-NEXT:    retq
1035  %arg0 = bitcast float* %a0 to i8*
1036  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1037  %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1)
1038  ret <8 x float> %res
1039}
1040declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readnone
1041
1042define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
1043; X86-LABEL: test_mm_maskstore_pd:
1044; X86:       # %bb.0:
1045; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1046; X86-NEXT:    vmaskmovpd %xmm1, %xmm0, (%eax)
1047; X86-NEXT:    retl
1048;
1049; X64-LABEL: test_mm_maskstore_pd:
1050; X64:       # %bb.0:
1051; X64-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi)
1052; X64-NEXT:    retq
1053  %arg0 = bitcast double* %a0 to i8*
1054  call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2)
1055  ret void
1056}
1057declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind readnone
1058
1059define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
1060; X86-LABEL: test_mm256_maskstore_pd:
1061; X86:       # %bb.0:
1062; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1063; X86-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax)
1064; X86-NEXT:    vzeroupper
1065; X86-NEXT:    retl
1066;
1067; X64-LABEL: test_mm256_maskstore_pd:
1068; X64:       # %bb.0:
1069; X64-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
1070; X64-NEXT:    vzeroupper
1071; X64-NEXT:    retq
1072  %arg0 = bitcast double* %a0 to i8*
1073  call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2)
1074  ret void
1075}
1076declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind readnone
1077
1078define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
1079; X86-LABEL: test_mm_maskstore_ps:
1080; X86:       # %bb.0:
1081; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1082; X86-NEXT:    vmaskmovps %xmm1, %xmm0, (%eax)
1083; X86-NEXT:    retl
1084;
1085; X64-LABEL: test_mm_maskstore_ps:
1086; X64:       # %bb.0:
1087; X64-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
1088; X64-NEXT:    retq
1089  %arg0 = bitcast float* %a0 to i8*
1090  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1091  call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2)
1092  ret void
1093}
1094declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind readnone
1095
1096define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
1097; X86-LABEL: test_mm256_maskstore_ps:
1098; X86:       # %bb.0:
1099; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1100; X86-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax)
1101; X86-NEXT:    vzeroupper
1102; X86-NEXT:    retl
1103;
1104; X64-LABEL: test_mm256_maskstore_ps:
1105; X64:       # %bb.0:
1106; X64-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
1107; X64-NEXT:    vzeroupper
1108; X64-NEXT:    retq
1109  %arg0 = bitcast float* %a0 to i8*
1110  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1111  call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2)
1112  ret void
1113}
1114declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind readnone
1115
1116define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1117; CHECK-LABEL: test_mm256_max_pd:
1118; CHECK:       # %bb.0:
1119; CHECK-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
1120; CHECK-NEXT:    ret{{[l|q]}}
1121  %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
1122  ret <4 x double> %res
1123}
1124declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
1125
1126define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1127; CHECK-LABEL: test_mm256_max_ps:
1128; CHECK:       # %bb.0:
1129; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
1130; CHECK-NEXT:    ret{{[l|q]}}
1131  %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
1132  ret <8 x float> %res
1133}
1134declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
1135
1136define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1137; CHECK-LABEL: test_mm256_min_pd:
1138; CHECK:       # %bb.0:
1139; CHECK-NEXT:    vminpd %ymm1, %ymm0, %ymm0
1140; CHECK-NEXT:    ret{{[l|q]}}
1141  %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
1142  ret <4 x double> %res
1143}
1144declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
1145
1146define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1147; CHECK-LABEL: test_mm256_min_ps:
1148; CHECK:       # %bb.0:
1149; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0
1150; CHECK-NEXT:    ret{{[l|q]}}
1151  %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
1152  ret <8 x float> %res
1153}
1154declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
1155
1156define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
1157; CHECK-LABEL: test_mm256_movedup_pd:
1158; CHECK:       # %bb.0:
1159; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
1160; CHECK-NEXT:    ret{{[l|q]}}
1161  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
1162  ret <4 x double> %res
1163}
1164
1165define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
1166; CHECK-LABEL: test_mm256_movehdup_ps:
1167; CHECK:       # %bb.0:
1168; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
1169; CHECK-NEXT:    ret{{[l|q]}}
1170  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
1171  ret <8 x float> %res
1172}
1173
1174define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
1175; CHECK-LABEL: test_mm256_moveldup_ps:
1176; CHECK:       # %bb.0:
1177; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
1178; CHECK-NEXT:    ret{{[l|q]}}
1179  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1180  ret <8 x float> %res
1181}
1182
1183define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
1184; CHECK-LABEL: test_mm256_movemask_pd:
1185; CHECK:       # %bb.0:
1186; CHECK-NEXT:    vmovmskpd %ymm0, %eax
1187; CHECK-NEXT:    vzeroupper
1188; CHECK-NEXT:    ret{{[l|q]}}
1189  %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
1190  ret i32 %res
1191}
1192declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
1193
1194define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
1195; CHECK-LABEL: test_mm256_movemask_ps:
1196; CHECK:       # %bb.0:
1197; CHECK-NEXT:    vmovmskps %ymm0, %eax
1198; CHECK-NEXT:    vzeroupper
1199; CHECK-NEXT:    ret{{[l|q]}}
1200  %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
1201  ret i32 %res
1202}
1203declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
1204
1205define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1206; CHECK-LABEL: test_mm256_mul_pd:
1207; CHECK:       # %bb.0:
1208; CHECK-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
1209; CHECK-NEXT:    ret{{[l|q]}}
1210  %res = fmul <4 x double> %a0, %a1
1211  ret <4 x double> %res
1212}
1213
1214define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1215; CHECK-LABEL: test_mm256_mul_ps:
1216; CHECK:       # %bb.0:
1217; CHECK-NEXT:    vmulps %ymm1, %ymm0, %ymm0
1218; CHECK-NEXT:    ret{{[l|q]}}
1219  %res = fmul <8 x float> %a0, %a1
1220  ret <8 x float> %res
1221}
1222
1223define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1224; CHECK-LABEL: test_mm256_or_pd:
1225; CHECK:       # %bb.0:
1226; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1227; CHECK-NEXT:    ret{{[l|q]}}
1228  %1 = bitcast <4 x double> %a0 to <4 x i64>
1229  %2 = bitcast <4 x double> %a1 to <4 x i64>
1230  %res = or <4 x i64> %1, %2
1231  %bc = bitcast <4 x i64> %res to <4 x double>
1232  ret <4 x double> %bc
1233}
1234
1235define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1236; CHECK-LABEL: test_mm256_or_ps:
1237; CHECK:       # %bb.0:
1238; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1239; CHECK-NEXT:    ret{{[l|q]}}
1240  %1 = bitcast <8 x float> %a0 to <8 x i32>
1241  %2 = bitcast <8 x float> %a1 to <8 x i32>
1242  %res = or <8 x i32> %1, %2
1243  %bc = bitcast <8 x i32> %res to <8 x float>
1244  ret <8 x float> %bc
1245}
1246
1247define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
1248; CHECK-LABEL: test_mm_permute_pd:
1249; CHECK:       # %bb.0:
1250; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1251; CHECK-NEXT:    ret{{[l|q]}}
1252  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
1253  ret <2 x double> %res
1254}
1255
1256define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
1257; CHECK-LABEL: test_mm256_permute_pd:
1258; CHECK:       # %bb.0:
1259; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
1260; CHECK-NEXT:    ret{{[l|q]}}
1261  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
1262  ret <4 x double> %res
1263}
1264
1265define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
1266; CHECK-LABEL: test_mm_permute_ps:
1267; CHECK:       # %bb.0:
1268; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
1269; CHECK-NEXT:    ret{{[l|q]}}
1270  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1271  ret <4 x float> %res
1272}
1273
1274define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
1275; CHECK-LABEL: test2_mm_permute_ps:
1276; CHECK:       # %bb.0:
1277; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
1278; CHECK-NEXT:    ret{{[l|q]}}
1279  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
1280  ret <4 x float> %res
1281}
1282
1283define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
1284; CHECK-LABEL: test_mm256_permute_ps:
1285; CHECK:       # %bb.0:
1286; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1287; CHECK-NEXT:    ret{{[l|q]}}
1288  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
1289  ret <8 x float> %res
1290}
1291
1292define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
1293; CHECK-LABEL: test_mm256_permute2f128_pd:
1294; CHECK:       # %bb.0:
1295; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
1296; CHECK-NEXT:    ret{{[l|q]}}
1297  %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1298  ret <4 x double> %res
1299}
1300declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
1301
1302; PR26667
1303define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1304; CHECK-LABEL: test_mm256_permute2f128_ps:
1305; CHECK:       # %bb.0:
1306; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1307; CHECK-NEXT:    ret{{[l|q]}}
1308  %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
1309  ret <8 x float> %res
1310}
1311declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
1312
1313define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1314; CHECK-LABEL: test_mm256_permute2f128_si256:
1315; CHECK:       # %bb.0:
1316; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
1317; CHECK-NEXT:    ret{{[l|q]}}
1318  %1 = bitcast <4 x i64> %a0 to <8 x i32>
1319  %2 = bitcast <4 x i64> %a1 to <8 x i32>
1320  %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
1321  %bc = bitcast <8 x i32> %res to <4 x i64>
1322  ret <4 x i64> %bc
1323}
1324declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
1325
1326define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
1327; CHECK-LABEL: test_mm_permutevar_pd:
1328; CHECK:       # %bb.0:
1329; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1330; CHECK-NEXT:    ret{{[l|q]}}
1331  %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
1332  ret <2 x double> %res
1333}
1334declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
1335
1336define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
1337; CHECK-LABEL: test_mm256_permutevar_pd:
1338; CHECK:       # %bb.0:
1339; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
1340; CHECK-NEXT:    ret{{[l|q]}}
1341  %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
1342  ret <4 x double> %res
1343}
1344declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
1345
1346define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
1347; CHECK-LABEL: test_mm_permutevar_ps:
1348; CHECK:       # %bb.0:
1349; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1350; CHECK-NEXT:    ret{{[l|q]}}
1351  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1352  %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1)
1353  ret <4 x float> %res
1354}
1355declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
1356
1357define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
1358; CHECK-LABEL: test_mm256_permutevar_ps:
1359; CHECK:       # %bb.0:
1360; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
1361; CHECK-NEXT:    ret{{[l|q]}}
1362  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1363  %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1)
1364  ret <8 x float> %res
1365}
1366declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
1367
1368define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
1369; CHECK-LABEL: test_mm256_rcp_ps:
1370; CHECK:       # %bb.0:
1371; CHECK-NEXT:    vrcpps %ymm0, %ymm0
1372; CHECK-NEXT:    ret{{[l|q]}}
1373  %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
1374  ret <8 x float> %res
1375}
1376declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
1377
1378define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
1379; CHECK-LABEL: test_mm256_round_pd:
1380; CHECK:       # %bb.0:
1381; CHECK-NEXT:    vroundpd $4, %ymm0, %ymm0
1382; CHECK-NEXT:    ret{{[l|q]}}
1383  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
1384  ret <4 x double> %res
1385}
1386
1387define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
1388; CHECK-LABEL: test_mm256_round_ps:
1389; CHECK:       # %bb.0:
1390; CHECK-NEXT:    vroundps $4, %ymm0, %ymm0
1391; CHECK-NEXT:    ret{{[l|q]}}
1392  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
1393  ret <8 x float> %res
1394}
1395
1396define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
1397; CHECK-LABEL: test_mm256_rsqrt_ps:
1398; CHECK:       # %bb.0:
1399; CHECK-NEXT:    vrsqrtps %ymm0, %ymm0
1400; CHECK-NEXT:    ret{{[l|q]}}
1401  %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
1402  ret <8 x float> %res
1403}
1404declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
1405
1406define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
1407; X86-LABEL: test_mm256_set_epi8:
1408; X86:       # %bb.0:
1409; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1410; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
1411; X86-NEXT:    vmovd %ecx, %xmm0
1412; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
1413; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1414; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1415; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1416; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
1417; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1418; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1419; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1420; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
1421; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1422; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1423; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1424; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
1425; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1426; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1427; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1428; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
1429; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1430; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1431; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1432; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
1433; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1434; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1435; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1436; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
1437; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1438; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1439; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1440; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
1441; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1442; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
1443; X86-NEXT:    vmovd %ecx, %xmm1
1444; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
1445; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1446; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
1447; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1448; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
1449; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1450; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
1451; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1452; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
1453; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1454; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
1455; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1456; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
1457; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1458; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
1459; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1460; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
1461; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1462; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
1463; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1464; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
1465; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1466; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
1467; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1468; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
1469; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1470; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
1471; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1472; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
1473; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1474; X86-NEXT:    retl
1475;
1476; X64-LABEL: test_mm256_set_epi8:
1477; X64:       # %bb.0:
1478; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
1479; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1480; X64-NEXT:    vmovd %eax, %xmm0
1481; X64-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0
1482; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1483; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1484; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1485; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
1486; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1487; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1488; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1489; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
1490; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1491; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1492; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1493; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
1494; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1495; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1496; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1497; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
1498; X64-NEXT:    movzbl %r9b, %eax
1499; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1500; X64-NEXT:    movzbl %r8b, %eax
1501; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
1502; X64-NEXT:    movzbl %cl, %eax
1503; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1504; X64-NEXT:    movzbl %dl, %eax
1505; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
1506; X64-NEXT:    movzbl %sil, %eax
1507; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1508; X64-NEXT:    movzbl %dil, %eax
1509; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
1510; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1511; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
1512; X64-NEXT:    vmovd %ecx, %xmm1
1513; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
1514; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1515; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
1516; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1517; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
1518; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1519; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
1520; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1521; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
1522; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1523; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
1524; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1525; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
1526; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1527; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
1528; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1529; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
1530; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1531; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
1532; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1533; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
1534; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1535; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
1536; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1537; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
1538; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1539; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
1540; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1541; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
1542; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1543; X64-NEXT:    retq
1544  %res0  = insertelement <32 x i8> undef,  i8 %a31, i32 0
1545  %res1  = insertelement <32 x i8> %res0,  i8 %a30, i32 1
1546  %res2  = insertelement <32 x i8> %res1,  i8 %a29, i32 2
1547  %res3  = insertelement <32 x i8> %res2,  i8 %a28, i32 3
1548  %res4  = insertelement <32 x i8> %res3,  i8 %a27, i32 4
1549  %res5  = insertelement <32 x i8> %res4,  i8 %a26, i32 5
1550  %res6  = insertelement <32 x i8> %res5,  i8 %a25, i32 6
1551  %res7  = insertelement <32 x i8> %res6,  i8 %a24, i32 7
1552  %res8  = insertelement <32 x i8> %res7,  i8 %a23, i32 8
1553  %res9  = insertelement <32 x i8> %res8,  i8 %a22, i32 9
1554  %res10 = insertelement <32 x i8> %res9,  i8 %a21, i32 10
1555  %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
1556  %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
1557  %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
1558  %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
1559  %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
1560  %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
1561  %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
1562  %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
1563  %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
1564  %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
1565  %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
1566  %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
1567  %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
1568  %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
1569  %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
1570  %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
1571  %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
1572  %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
1573  %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
1574  %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
1575  %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
1576  %res = bitcast <32 x i8> %res31 to <4 x i64>
1577  ret <4 x i64> %res
1578}
1579
1580define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
1581; X86-LABEL: test_mm256_set_epi16:
1582; X86:       # %bb.0:
1583; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1584; X86-NEXT:    vmovd %eax, %xmm0
1585; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1586; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
1587; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1588; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
1589; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1590; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
1591; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1592; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
1593; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1594; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
1595; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1596; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
1597; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1598; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1599; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1600; X86-NEXT:    vmovd %eax, %xmm1
1601; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1602; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
1603; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1604; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
1605; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1606; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
1607; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1608; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
1609; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1610; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
1611; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1612; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
1613; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1614; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
1615; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1616; X86-NEXT:    retl
1617;
1618; X64-LABEL: test_mm256_set_epi16:
1619; X64:       # %bb.0:
1620; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1621; X64-NEXT:    vmovd %eax, %xmm0
1622; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1623; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
1624; X64-NEXT:    vpinsrw $2, %r9d, %xmm0, %xmm0
1625; X64-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0
1626; X64-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
1627; X64-NEXT:    vpinsrw $5, %edx, %xmm0, %xmm0
1628; X64-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0
1629; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
1630; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1631; X64-NEXT:    vmovd %eax, %xmm1
1632; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1633; X64-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
1634; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1635; X64-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
1636; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1637; X64-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
1638; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1639; X64-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
1640; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1641; X64-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
1642; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1643; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
1644; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1645; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
1646; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1647; X64-NEXT:    retq
1648  %res0  = insertelement <16 x i16> undef,  i16 %a15, i32 0
1649  %res1  = insertelement <16 x i16> %res0,  i16 %a14, i32 1
1650  %res2  = insertelement <16 x i16> %res1,  i16 %a13, i32 2
1651  %res3  = insertelement <16 x i16> %res2,  i16 %a12, i32 3
1652  %res4  = insertelement <16 x i16> %res3,  i16 %a11, i32 4
1653  %res5  = insertelement <16 x i16> %res4,  i16 %a10, i32 5
1654  %res6  = insertelement <16 x i16> %res5,  i16 %a9 , i32 6
1655  %res7  = insertelement <16 x i16> %res6,  i16 %a8 , i32 7
1656  %res8  = insertelement <16 x i16> %res7,  i16 %a7 , i32 8
1657  %res9  = insertelement <16 x i16> %res8,  i16 %a6 , i32 9
1658  %res10 = insertelement <16 x i16> %res9,  i16 %a5 , i32 10
1659  %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11
1660  %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12
1661  %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13
1662  %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14
1663  %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15
1664  %res = bitcast <16 x i16> %res15 to <4 x i64>
1665  ret <4 x i64> %res
1666}
1667
1668define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
1669; X86-LABEL: test_mm256_set_epi32:
1670; X86:       # %bb.0:
1671; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1672; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
1673; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
1674; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
1675; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1676; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1677; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
1678; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
1679; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1680; X86-NEXT:    retl
1681;
1682; X64-LABEL: test_mm256_set_epi32:
1683; X64:       # %bb.0:
1684; X64-NEXT:    vmovd %ecx, %xmm0
1685; X64-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
1686; X64-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
1687; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
1688; X64-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1689; X64-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
1690; X64-NEXT:    vpinsrd $2, %r9d, %xmm1, %xmm1
1691; X64-NEXT:    vpinsrd $3, %r8d, %xmm1, %xmm1
1692; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1693; X64-NEXT:    retq
1694  %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
1695  %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
1696  %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
1697  %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
1698  %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
1699  %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
1700  %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
1701  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
1702  %res = bitcast <8 x i32> %res7 to <4 x i64>
1703  ret <4 x i64> %res
1704}
1705
1706define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
1707; X86-LABEL: test_mm256_set_epi64x:
1708; X86:       # %bb.0:
1709; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1710; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
1711; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
1712; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
1713; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1714; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1715; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
1716; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
1717; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1718; X86-NEXT:    retl
1719;
1720; X64-LABEL: test_mm256_set_epi64x:
1721; X64:       # %bb.0:
1722; X64-NEXT:    vmovq %rdi, %xmm0
1723; X64-NEXT:    vmovq %rsi, %xmm1
1724; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1725; X64-NEXT:    vmovq %rdx, %xmm1
1726; X64-NEXT:    vmovq %rcx, %xmm2
1727; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1728; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1729; X64-NEXT:    retq
1730  %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
1731  %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
1732  %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
1733  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
1734  ret <4 x i64> %res3
1735}
1736
1737define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
1738; CHECK-LABEL: test_mm256_set_m128:
1739; CHECK:       # %bb.0:
1740; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1741; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1742; CHECK-NEXT:    ret{{[l|q]}}
1743  %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1744  ret <8 x float> %res
1745}
1746
1747define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
1748; CHECK-LABEL: test_mm256_set_m128d:
1749; CHECK:       # %bb.0:
1750; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1751; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1752; CHECK-NEXT:    ret{{[l|q]}}
1753  %arg0 = bitcast <2 x double> %a0 to <4 x float>
1754  %arg1 = bitcast <2 x double> %a1 to <4 x float>
1755  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1756  %bc = bitcast <8 x float> %res to <4 x double>
1757  ret <4 x double> %bc
1758}
1759
1760define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
1761; CHECK-LABEL: test_mm256_set_m128i:
1762; CHECK:       # %bb.0:
1763; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1764; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1765; CHECK-NEXT:    ret{{[l|q]}}
1766  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
1767  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
1768  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1769  %bc = bitcast <8 x float> %res to <4 x i64>
1770  ret <4 x i64> %bc
1771}
1772
1773define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
1774; X86-LABEL: test_mm256_set_pd:
1775; X86:       # %bb.0:
1776; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1777; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
1778; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1779; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
1780; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
1781; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1782; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1783; X86-NEXT:    retl
1784;
1785; X64-LABEL: test_mm256_set_pd:
1786; X64:       # %bb.0:
1787; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1788; X64-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
1789; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1790; X64-NEXT:    retq
1791  %res0 = insertelement <4 x double> undef, double %a3, i32 0
1792  %res1 = insertelement <4 x double> %res0, double %a2, i32 1
1793  %res2 = insertelement <4 x double> %res1, double %a1, i32 2
1794  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
1795  ret <4 x double> %res3
1796}
1797
1798define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
1799; X86-LABEL: test_mm256_set_ps:
1800; X86:       # %bb.0:
1801; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1802; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1803; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1804; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1805; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
1806; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1807; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
1808; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1809; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1810; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
1811; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1812; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1813; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1814; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
1815; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1816; X86-NEXT:    retl
1817;
1818; X64-LABEL: test_mm256_set_ps:
1819; X64:       # %bb.0:
1820; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
1821; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1822; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1823; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
1824; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
1825; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
1826; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1827; X64-NEXT:    retq
1828  %res0 = insertelement <8 x float> undef, float %a7, i32 0
1829  %res1 = insertelement <8 x float> %res0, float %a6, i32 1
1830  %res2 = insertelement <8 x float> %res1, float %a5, i32 2
1831  %res3 = insertelement <8 x float> %res2, float %a4, i32 3
1832  %res4 = insertelement <8 x float> %res3, float %a3, i32 4
1833  %res5 = insertelement <8 x float> %res4, float %a2, i32 5
1834  %res6 = insertelement <8 x float> %res5, float %a1, i32 6
1835  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
1836  ret <8 x float> %res7
1837}
1838
1839define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
1840; X86-LABEL: test_mm256_set1_epi8:
1841; X86:       # %bb.0:
1842; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1843; X86-NEXT:    vmovd %eax, %xmm0
1844; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1845; X86-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1846; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1847; X86-NEXT:    retl
1848;
1849; X64-LABEL: test_mm256_set1_epi8:
1850; X64:       # %bb.0:
1851; X64-NEXT:    movzbl %dil, %eax
1852; X64-NEXT:    vmovd %eax, %xmm0
1853; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1854; X64-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1855; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1856; X64-NEXT:    retq
1857  %res0  = insertelement <32 x i8> undef,  i8 %a0, i32 0
1858  %res1  = insertelement <32 x i8> %res0,  i8 %a0, i32 1
1859  %res2  = insertelement <32 x i8> %res1,  i8 %a0, i32 2
1860  %res3  = insertelement <32 x i8> %res2,  i8 %a0, i32 3
1861  %res4  = insertelement <32 x i8> %res3,  i8 %a0, i32 4
1862  %res5  = insertelement <32 x i8> %res4,  i8 %a0, i32 5
1863  %res6  = insertelement <32 x i8> %res5,  i8 %a0, i32 6
1864  %res7  = insertelement <32 x i8> %res6,  i8 %a0, i32 7
1865  %res8  = insertelement <32 x i8> %res7,  i8 %a0, i32 8
1866  %res9  = insertelement <32 x i8> %res8,  i8 %a0, i32 9
1867  %res10 = insertelement <32 x i8> %res9,  i8 %a0, i32 10
1868  %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
1869  %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
1870  %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
1871  %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
1872  %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
1873  %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
1874  %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
1875  %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
1876  %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
1877  %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
1878  %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
1879  %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
1880  %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
1881  %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
1882  %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
1883  %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
1884  %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
1885  %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
1886  %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
1887  %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
1888  %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
1889  %res = bitcast <32 x i8> %res31 to <4 x i64>
1890  ret <4 x i64> %res
1891}
1892
1893define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
1894; X86-LABEL: test_mm256_set1_epi16:
1895; X86:       # %bb.0:
1896; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1897; X86-NEXT:    vmovd %eax, %xmm0
1898; X86-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1899; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1900; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1901; X86-NEXT:    retl
1902;
1903; X64-LABEL: test_mm256_set1_epi16:
1904; X64:       # %bb.0:
1905; X64-NEXT:    vmovd %edi, %xmm0
1906; X64-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
1907; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1908; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1909; X64-NEXT:    retq
1910  %res0  = insertelement <16 x i16> undef,  i16 %a0, i32 0
1911  %res1  = insertelement <16 x i16> %res0,  i16 %a0, i32 1
1912  %res2  = insertelement <16 x i16> %res1,  i16 %a0, i32 2
1913  %res3  = insertelement <16 x i16> %res2,  i16 %a0, i32 3
1914  %res4  = insertelement <16 x i16> %res3,  i16 %a0, i32 4
1915  %res5  = insertelement <16 x i16> %res4,  i16 %a0, i32 5
1916  %res6  = insertelement <16 x i16> %res5,  i16 %a0, i32 6
1917  %res7  = insertelement <16 x i16> %res6,  i16 %a0, i32 7
1918  %res8  = insertelement <16 x i16> %res7,  i16 %a0, i32 8
1919  %res9  = insertelement <16 x i16> %res8,  i16 %a0, i32 9
1920  %res10 = insertelement <16 x i16> %res9,  i16 %a0, i32 10
1921  %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
1922  %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
1923  %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
1924  %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
1925  %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
1926  %res = bitcast <16 x i16> %res15 to <4 x i64>
1927  ret <4 x i64> %res
1928}
1929
1930define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
1931; X86-LABEL: test_mm256_set1_epi32:
1932; X86:       # %bb.0:
1933; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1934; X86-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
1935; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1936; X86-NEXT:    retl
1937;
1938; X64-LABEL: test_mm256_set1_epi32:
1939; X64:       # %bb.0:
1940; X64-NEXT:    vmovd %edi, %xmm0
1941; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1942; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1943; X64-NEXT:    retq
1944  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
1945  %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
1946  %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
1947  %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
1948  %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
1949  %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
1950  %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
1951  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
1952  %res = bitcast <8 x i32> %res7 to <4 x i64>
1953  ret <4 x i64> %res
1954}
1955
1956define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
1957; X86-LABEL: test_mm256_set1_epi64x:
1958; X86:       # %bb.0:
1959; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1960; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
1961; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1962; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1963; X86-NEXT:    retl
1964;
1965; X64-LABEL: test_mm256_set1_epi64x:
1966; X64:       # %bb.0:
1967; X64-NEXT:    vmovq %rdi, %xmm0
1968; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1969; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1970; X64-NEXT:    retq
1971  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
1972  %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
1973  %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
1974  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
1975  ret <4 x i64> %res3
1976}
1977
1978define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
1979; X86-LABEL: test_mm256_set1_pd:
1980; X86:       # %bb.0:
1981; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1982; X86-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1983; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1984; X86-NEXT:    retl
1985;
1986; X64-LABEL: test_mm256_set1_pd:
1987; X64:       # %bb.0:
1988; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1989; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1990; X64-NEXT:    retq
1991  %res0 = insertelement <4 x double> undef, double %a0, i32 0
1992  %res1 = insertelement <4 x double> %res0, double %a0, i32 1
1993  %res2 = insertelement <4 x double> %res1, double %a0, i32 2
1994  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
1995  ret <4 x double> %res3
1996}
1997
1998define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
1999; X86-LABEL: test_mm256_set1_ps:
2000; X86:       # %bb.0:
2001; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2002; X86-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2003; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2004; X86-NEXT:    retl
2005;
2006; X64-LABEL: test_mm256_set1_ps:
2007; X64:       # %bb.0:
2008; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2009; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2010; X64-NEXT:    retq
2011  %res0 = insertelement <8 x float> undef, float %a0, i32 0
2012  %res1 = insertelement <8 x float> %res0, float %a0, i32 1
2013  %res2 = insertelement <8 x float> %res1, float %a0, i32 2
2014  %res3 = insertelement <8 x float> %res2, float %a0, i32 3
2015  %res4 = insertelement <8 x float> %res3, float %a0, i32 4
2016  %res5 = insertelement <8 x float> %res4, float %a0, i32 5
2017  %res6 = insertelement <8 x float> %res5, float %a0, i32 6
2018  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
2019  ret <8 x float> %res7
2020}
2021
2022define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
2023; X86-LABEL: test_mm256_setr_epi8:
2024; X86:       # %bb.0:
2025; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2026; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
2027; X86-NEXT:    vmovd %ecx, %xmm0
2028; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
2029; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2030; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
2031; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2032; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
2033; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2034; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
2035; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2036; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
2037; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2038; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
2039; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2040; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
2041; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2042; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
2043; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2044; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
2045; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2046; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
2047; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2048; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
2049; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2050; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
2051; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2052; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
2053; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2054; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
2055; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2056; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2057; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2058; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
2059; X86-NEXT:    vmovd %ecx, %xmm1
2060; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
2061; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2062; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
2063; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2064; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
2065; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2066; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
2067; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2068; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
2069; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2070; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
2071; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2072; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
2073; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2074; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
2075; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2076; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
2077; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2078; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
2079; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2080; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
2081; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2082; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
2083; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2084; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
2085; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2086; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
2087; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2088; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
2089; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2090; X86-NEXT:    retl
2091;
2092; X64-LABEL: test_mm256_setr_epi8:
2093; X64:       # %bb.0:
2094; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2095; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2096; X64-NEXT:    vmovd %eax, %xmm0
2097; X64-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0
2098; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2099; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
2100; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2101; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
2102; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2103; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
2104; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2105; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
2106; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2107; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
2108; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2109; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
2110; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2111; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
2112; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2113; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
2114; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2115; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
2116; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2117; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
2118; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2119; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
2120; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2121; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
2122; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2123; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
2124; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2125; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2126; X64-NEXT:    movzbl %sil, %eax
2127; X64-NEXT:    movzbl %dil, %esi
2128; X64-NEXT:    vmovd %esi, %xmm1
2129; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
2130; X64-NEXT:    movzbl %dl, %eax
2131; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
2132; X64-NEXT:    movzbl %cl, %eax
2133; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
2134; X64-NEXT:    movzbl %r8b, %eax
2135; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
2136; X64-NEXT:    movzbl %r9b, %eax
2137; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
2138; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2139; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
2140; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2141; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
2142; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2143; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
2144; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2145; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
2146; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2147; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
2148; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2149; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
2150; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2151; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
2152; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2153; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
2154; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2155; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
2156; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2157; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
2158; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2159; X64-NEXT:    retq
2160  %res0  = insertelement <32 x i8> undef,  i8 %a0 , i32 0
2161  %res1  = insertelement <32 x i8> %res0,  i8 %a1 , i32 1
2162  %res2  = insertelement <32 x i8> %res1,  i8 %a2 , i32 2
2163  %res3  = insertelement <32 x i8> %res2,  i8 %a3 , i32 3
2164  %res4  = insertelement <32 x i8> %res3,  i8 %a4 , i32 4
2165  %res5  = insertelement <32 x i8> %res4,  i8 %a5 , i32 5
2166  %res6  = insertelement <32 x i8> %res5,  i8 %a6 , i32 6
2167  %res7  = insertelement <32 x i8> %res6,  i8 %a7 , i32 7
2168  %res8  = insertelement <32 x i8> %res7,  i8 %a8 , i32 8
2169  %res9  = insertelement <32 x i8> %res8,  i8 %a9 , i32 9
2170  %res10 = insertelement <32 x i8> %res9,  i8 %a10, i32 10
2171  %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
2172  %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
2173  %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
2174  %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
2175  %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
2176  %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
2177  %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
2178  %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
2179  %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
2180  %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
2181  %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
2182  %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
2183  %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
2184  %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
2185  %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
2186  %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
2187  %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
2188  %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
2189  %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
2190  %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
2191  %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
2192  %res = bitcast <32 x i8> %res31 to <4 x i64>
2193  ret <4 x i64> %res
2194}
2195
2196define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
2197; X86-LABEL: test_mm256_setr_epi16:
2198; X86:       # %bb.0:
2199; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2200; X86-NEXT:    vmovd %eax, %xmm0
2201; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2202; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2203; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2204; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
2205; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2206; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
2207; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2208; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
2209; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2210; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
2211; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2212; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
2213; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2214; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
2215; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2216; X86-NEXT:    vmovd %eax, %xmm1
2217; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2218; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
2219; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2220; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
2221; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2222; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
2223; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2224; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
2225; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2226; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
2227; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2228; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
2229; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2230; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
2231; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2232; X86-NEXT:    retl
2233;
2234; X64-LABEL: test_mm256_setr_epi16:
2235; X64:       # %bb.0:
2236; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2237; X64-NEXT:    vmovd %eax, %xmm0
2238; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2239; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2240; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2241; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
2242; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2243; X64-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
2244; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2245; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
2246; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2247; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
2248; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2249; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
2250; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2251; X64-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
2252; X64-NEXT:    vmovd %edi, %xmm1
2253; X64-NEXT:    vpinsrw $1, %esi, %xmm1, %xmm1
2254; X64-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
2255; X64-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
2256; X64-NEXT:    vpinsrw $4, %r8d, %xmm1, %xmm1
2257; X64-NEXT:    vpinsrw $5, %r9d, %xmm1, %xmm1
2258; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2259; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
2260; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2261; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
2262; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2263; X64-NEXT:    retq
2264  %res0  = insertelement <16 x i16> undef,  i16 %a0 , i32 0
2265  %res1  = insertelement <16 x i16> %res0,  i16 %a1 , i32 1
2266  %res2  = insertelement <16 x i16> %res1,  i16 %a2 , i32 2
2267  %res3  = insertelement <16 x i16> %res2,  i16 %a3 , i32 3
2268  %res4  = insertelement <16 x i16> %res3,  i16 %a4 , i32 4
2269  %res5  = insertelement <16 x i16> %res4,  i16 %a5 , i32 5
2270  %res6  = insertelement <16 x i16> %res5,  i16 %a6 , i32 6
2271  %res7  = insertelement <16 x i16> %res6,  i16 %a7 , i32 7
2272  %res8  = insertelement <16 x i16> %res7,  i16 %a8 , i32 8
2273  %res9  = insertelement <16 x i16> %res8,  i16 %a9 , i32 9
2274  %res10 = insertelement <16 x i16> %res9,  i16 %a10, i32 10
2275  %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
2276  %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
2277  %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
2278  %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
2279  %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
2280  %res = bitcast <16 x i16> %res15 to <4 x i64>
2281  ret <4 x i64> %res
2282}
2283
2284define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
2285; X86-LABEL: test_mm256_setr_epi32:
2286; X86:       # %bb.0:
2287; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2288; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2289; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2290; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2291; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2292; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
2293; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
2294; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
2295; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2296; X86-NEXT:    retl
2297;
2298; X64-LABEL: test_mm256_setr_epi32:
2299; X64:       # %bb.0:
2300; X64-NEXT:    vmovd %r8d, %xmm0
2301; X64-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
2302; X64-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
2303; X64-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
2304; X64-NEXT:    vmovd %edi, %xmm1
2305; X64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
2306; X64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
2307; X64-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
2308; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2309; X64-NEXT:    retq
2310  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
2311  %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
2312  %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
2313  %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
2314  %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
2315  %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
2316  %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
2317  %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
2318  %res = bitcast <8 x i32> %res7 to <4 x i64>
2319  ret <4 x i64> %res
2320}
2321
2322define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
2323; X86-LABEL: test_mm256_setr_epi64x:
2324; X86:       # %bb.0:
2325; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2326; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2327; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2328; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2329; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2330; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
2331; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
2332; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
2333; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2334; X86-NEXT:    retl
2335;
2336; X64-LABEL: test_mm256_setr_epi64x:
2337; X64:       # %bb.0:
2338; X64-NEXT:    vmovq %rcx, %xmm0
2339; X64-NEXT:    vmovq %rdx, %xmm1
2340; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2341; X64-NEXT:    vmovq %rsi, %xmm1
2342; X64-NEXT:    vmovq %rdi, %xmm2
2343; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
2344; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2345; X64-NEXT:    retq
2346  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
2347  %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
2348  %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
2349  %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
2350  ret <4 x i64> %res3
2351}
2352
2353define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
2354; CHECK-LABEL: test_mm256_setr_m128:
2355; CHECK:       # %bb.0:
2356; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2357; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2358; CHECK-NEXT:    ret{{[l|q]}}
2359  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2360  ret <8 x float> %res
2361}
2362
2363define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
2364; CHECK-LABEL: test_mm256_setr_m128d:
2365; CHECK:       # %bb.0:
2366; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2367; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2368; CHECK-NEXT:    ret{{[l|q]}}
2369  %arg0 = bitcast <2 x double> %a0 to <4 x float>
2370  %arg1 = bitcast <2 x double> %a1 to <4 x float>
2371  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2372  %bc = bitcast <8 x float> %res to <4 x double>
2373  ret <4 x double> %bc
2374}
2375
2376define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
2377; CHECK-LABEL: test_mm256_setr_m128i:
2378; CHECK:       # %bb.0:
2379; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2380; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2381; CHECK-NEXT:    ret{{[l|q]}}
2382  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
2383  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
2384  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2385  %bc = bitcast <8 x float> %res to <4 x i64>
2386  ret <4 x i64> %bc
2387}
2388
2389define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
2390; X86-LABEL: test_mm256_setr_pd:
2391; X86:       # %bb.0:
2392; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2393; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
2394; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2395; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
2396; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
2397; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
2398; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2399; X86-NEXT:    retl
2400;
2401; X64-LABEL: test_mm256_setr_pd:
2402; X64:       # %bb.0:
2403; X64-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2404; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2405; X64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2406; X64-NEXT:    retq
2407  %res0 = insertelement <4 x double> undef, double %a0, i32 0
2408  %res1 = insertelement <4 x double> %res0, double %a1, i32 1
2409  %res2 = insertelement <4 x double> %res1, double %a2, i32 2
2410  %res3 = insertelement <4 x double> %res2, double %a3, i32 3
2411  ret <4 x double> %res3
2412}
2413
2414define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
2415; X86-LABEL: test_mm256_setr_ps:
2416; X86:       # %bb.0:
2417; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2418; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2419; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2420; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
2421; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2422; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2423; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2424; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2425; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2426; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
2427; X86-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
2428; X86-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
2429; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
2430; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
2431; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2432; X86-NEXT:    retl
2433;
2434; X64-LABEL: test_mm256_setr_ps:
2435; X64:       # %bb.0:
2436; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
2437; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
2438; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
2439; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2440; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
2441; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
2442; X64-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
2443; X64-NEXT:    retq
2444  %res0 = insertelement <8 x float> undef, float %a0, i32 0
2445  %res1 = insertelement <8 x float> %res0, float %a1, i32 1
2446  %res2 = insertelement <8 x float> %res1, float %a2, i32 2
2447  %res3 = insertelement <8 x float> %res2, float %a3, i32 3
2448  %res4 = insertelement <8 x float> %res3, float %a4, i32 4
2449  %res5 = insertelement <8 x float> %res4, float %a5, i32 5
2450  %res6 = insertelement <8 x float> %res5, float %a6, i32 6
2451  %res7 = insertelement <8 x float> %res6, float %a7, i32 7
2452  ret <8 x float> %res7
2453}
2454
2455define <4 x double> @test_mm256_setzero_pd() nounwind {
2456; CHECK-LABEL: test_mm256_setzero_pd:
2457; CHECK:       # %bb.0:
2458; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2459; CHECK-NEXT:    ret{{[l|q]}}
2460  ret <4 x double> zeroinitializer
2461}
2462
2463define <8 x float> @test_mm256_setzero_ps() nounwind {
2464; CHECK-LABEL: test_mm256_setzero_ps:
2465; CHECK:       # %bb.0:
2466; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2467; CHECK-NEXT:    ret{{[l|q]}}
2468  ret <8 x float> zeroinitializer
2469}
2470
2471define <4 x i64> @test_mm256_setzero_si256() nounwind {
2472; CHECK-LABEL: test_mm256_setzero_si256:
2473; CHECK:       # %bb.0:
2474; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2475; CHECK-NEXT:    ret{{[l|q]}}
2476  ret <4 x i64> zeroinitializer
2477}
2478
2479define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2480; CHECK-LABEL: test_mm256_shuffle_pd:
2481; CHECK:       # %bb.0:
2482; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2483; CHECK-NEXT:    ret{{[l|q]}}
2484  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
2485  ret <4 x double> %res
2486}
2487
2488define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2489; CHECK-LABEL: test_mm256_shuffle_ps:
2490; CHECK:       # %bb.0:
2491; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
2492; CHECK-NEXT:    ret{{[l|q]}}
2493  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
2494  ret <8 x float> %res
2495}
2496
2497define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
2498; CHECK-LABEL: test_mm256_sqrt_pd:
2499; CHECK:       # %bb.0: # %entry
2500; CHECK-NEXT:    vsqrtpd %ymm0, %ymm0
2501; CHECK-NEXT:    ret{{[l|q]}}
2502entry:
2503  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2
2504  ret <4 x double> %0
2505}
2506
2507declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1
2508
2509define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
2510; CHECK-LABEL: test_mm256_sqrt_ps:
2511; CHECK:       # %bb.0: # %entry
2512; CHECK-NEXT:    vsqrtps %ymm0, %ymm0
2513; CHECK-NEXT:    ret{{[l|q]}}
2514entry:
2515  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2
2516  ret <8 x float> %0
2517}
2518
2519declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
2520
2521define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
2522; X86-LABEL: test_mm256_store_pd:
2523; X86:       # %bb.0:
2524; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2525; X86-NEXT:    vmovaps %ymm0, (%eax)
2526; X86-NEXT:    vzeroupper
2527; X86-NEXT:    retl
2528;
2529; X64-LABEL: test_mm256_store_pd:
2530; X64:       # %bb.0:
2531; X64-NEXT:    vmovaps %ymm0, (%rdi)
2532; X64-NEXT:    vzeroupper
2533; X64-NEXT:    retq
2534  %arg0 = bitcast double* %a0 to <4 x double>*
2535  store <4 x double> %a1, <4 x double>* %arg0, align 32
2536  ret void
2537}
2538
2539define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind {
2540; X86-LABEL: test_mm256_store_ps:
2541; X86:       # %bb.0:
2542; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2543; X86-NEXT:    vmovaps %ymm0, (%eax)
2544; X86-NEXT:    vzeroupper
2545; X86-NEXT:    retl
2546;
2547; X64-LABEL: test_mm256_store_ps:
2548; X64:       # %bb.0:
2549; X64-NEXT:    vmovaps %ymm0, (%rdi)
2550; X64-NEXT:    vzeroupper
2551; X64-NEXT:    retq
2552  %arg0 = bitcast float* %a0 to <8 x float>*
2553  store <8 x float> %a1, <8 x float>* %arg0, align 32
2554  ret void
2555}
2556
2557define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
2558; X86-LABEL: test_mm256_store_si256:
2559; X86:       # %bb.0:
2560; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2561; X86-NEXT:    vmovaps %ymm0, (%eax)
2562; X86-NEXT:    vzeroupper
2563; X86-NEXT:    retl
2564;
2565; X64-LABEL: test_mm256_store_si256:
2566; X64:       # %bb.0:
2567; X64-NEXT:    vmovaps %ymm0, (%rdi)
2568; X64-NEXT:    vzeroupper
2569; X64-NEXT:    retq
2570  store <4 x i64> %a1, <4 x i64>* %a0, align 32
2571  ret void
2572}
2573
2574define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind {
2575; X86-LABEL: test_mm256_storeu_pd:
2576; X86:       # %bb.0:
2577; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2578; X86-NEXT:    vmovups %ymm0, (%eax)
2579; X86-NEXT:    vzeroupper
2580; X86-NEXT:    retl
2581;
2582; X64-LABEL: test_mm256_storeu_pd:
2583; X64:       # %bb.0:
2584; X64-NEXT:    vmovups %ymm0, (%rdi)
2585; X64-NEXT:    vzeroupper
2586; X64-NEXT:    retq
2587  %arg0 = bitcast double* %a0 to <4 x double>*
2588  store <4 x double> %a1, <4 x double>* %arg0, align 1
2589  ret void
2590}
2591
2592define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind {
2593; X86-LABEL: test_mm256_storeu_ps:
2594; X86:       # %bb.0:
2595; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2596; X86-NEXT:    vmovups %ymm0, (%eax)
2597; X86-NEXT:    vzeroupper
2598; X86-NEXT:    retl
2599;
2600; X64-LABEL: test_mm256_storeu_ps:
2601; X64:       # %bb.0:
2602; X64-NEXT:    vmovups %ymm0, (%rdi)
2603; X64-NEXT:    vzeroupper
2604; X64-NEXT:    retq
2605  %arg0 = bitcast float* %a0 to <8 x float>*
2606  store <8 x float> %a1, <8 x float>* %arg0, align 1
2607  ret void
2608}
2609
2610define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
2611; X86-LABEL: test_mm256_storeu_si256:
2612; X86:       # %bb.0:
2613; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2614; X86-NEXT:    vmovups %ymm0, (%eax)
2615; X86-NEXT:    vzeroupper
2616; X86-NEXT:    retl
2617;
2618; X64-LABEL: test_mm256_storeu_si256:
2619; X64:       # %bb.0:
2620; X64-NEXT:    vmovups %ymm0, (%rdi)
2621; X64-NEXT:    vzeroupper
2622; X64-NEXT:    retq
2623  store <4 x i64> %a1, <4 x i64>* %a0, align 1
2624  ret void
2625}
2626
2627define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind {
2628; X86-LABEL: test_mm256_storeu2_m128:
2629; X86:       # %bb.0:
2630; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2631; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2632; X86-NEXT:    vmovups %xmm0, (%ecx)
2633; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
2634; X86-NEXT:    vmovups %xmm0, (%eax)
2635; X86-NEXT:    vzeroupper
2636; X86-NEXT:    retl
2637;
2638; X64-LABEL: test_mm256_storeu2_m128:
2639; X64:       # %bb.0:
2640; X64-NEXT:    vmovups %xmm0, (%rdi)
2641; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
2642; X64-NEXT:    vmovups %xmm0, (%rsi)
2643; X64-NEXT:    vzeroupper
2644; X64-NEXT:    retq
2645  %arg0 = bitcast float* %a0 to <4 x float>*
2646  %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2647  store <4 x float> %lo, <4 x float>* %arg0, align 1
2648  %arg1 = bitcast float* %a1 to <4 x float>*
2649  %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2650  store <4 x float> %hi, <4 x float>* %arg1, align 1
2651  ret void
2652}
2653
2654define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind {
2655; X86-LABEL: test_mm256_storeu2_m128d:
2656; X86:       # %bb.0:
2657; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2658; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2659; X86-NEXT:    vmovups %xmm0, (%ecx)
2660; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
2661; X86-NEXT:    vmovups %xmm0, (%eax)
2662; X86-NEXT:    vzeroupper
2663; X86-NEXT:    retl
2664;
2665; X64-LABEL: test_mm256_storeu2_m128d:
2666; X64:       # %bb.0:
2667; X64-NEXT:    vmovups %xmm0, (%rdi)
2668; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
2669; X64-NEXT:    vmovups %xmm0, (%rsi)
2670; X64-NEXT:    vzeroupper
2671; X64-NEXT:    retq
2672  %arg0 = bitcast double* %a0 to <2 x double>*
2673  %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
2674  store <2 x double> %lo, <2 x double>* %arg0, align 1
2675  %arg1 = bitcast double* %a1 to <2 x double>*
2676  %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
2677  store <2 x double> %hi, <2 x double>* %arg1, align 1
2678  ret void
2679}
2680
2681define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind {
2682; X86-LABEL: test_mm256_storeu2_m128i:
2683; X86:       # %bb.0:
2684; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2685; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2686; X86-NEXT:    vmovups %xmm0, (%ecx)
2687; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
2688; X86-NEXT:    vmovups %xmm0, (%eax)
2689; X86-NEXT:    vzeroupper
2690; X86-NEXT:    retl
2691;
2692; X64-LABEL: test_mm256_storeu2_m128i:
2693; X64:       # %bb.0:
2694; X64-NEXT:    vmovups %xmm0, (%rdi)
2695; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
2696; X64-NEXT:    vmovups %xmm0, (%rsi)
2697; X64-NEXT:    vzeroupper
2698; X64-NEXT:    retq
2699  %arg0 = bitcast <2 x i64>* %a0 to <2 x i64>*
2700  %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
2701  store <2 x i64> %lo, <2 x i64>* %arg0, align 1
2702  %arg1 = bitcast <2 x i64>* %a1 to <2 x i64>*
2703  %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
2704  store <2 x i64> %hi, <2 x i64>* %arg1, align 1
2705  ret void
2706}
2707
2708define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind {
2709; X86-LABEL: test_mm256_stream_pd:
2710; X86:       # %bb.0:
2711; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2712; X86-NEXT:    vmovntps %ymm0, (%eax)
2713; X86-NEXT:    vzeroupper
2714; X86-NEXT:    retl
2715;
2716; X64-LABEL: test_mm256_stream_pd:
2717; X64:       # %bb.0:
2718; X64-NEXT:    vmovntps %ymm0, (%rdi)
2719; X64-NEXT:    vzeroupper
2720; X64-NEXT:    retq
2721  %arg0 = bitcast double* %a0 to <4 x double>*
2722  store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0
2723  ret void
2724}
2725
2726define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind {
2727; X86-LABEL: test_mm256_stream_ps:
2728; X86:       # %bb.0:
2729; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2730; X86-NEXT:    vmovntps %ymm0, (%eax)
2731; X86-NEXT:    vzeroupper
2732; X86-NEXT:    retl
2733;
2734; X64-LABEL: test_mm256_stream_ps:
2735; X64:       # %bb.0:
2736; X64-NEXT:    vmovntps %ymm0, (%rdi)
2737; X64-NEXT:    vzeroupper
2738; X64-NEXT:    retq
2739  %arg0 = bitcast float* %a0 to <8 x float>*
2740  store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0
2741  ret void
2742}
2743
2744define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind {
2745; X86-LABEL: test_mm256_stream_si256:
2746; X86:       # %bb.0:
2747; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2748; X86-NEXT:    vmovntps %ymm0, (%eax)
2749; X86-NEXT:    vzeroupper
2750; X86-NEXT:    retl
2751;
2752; X64-LABEL: test_mm256_stream_si256:
2753; X64:       # %bb.0:
2754; X64-NEXT:    vmovntps %ymm0, (%rdi)
2755; X64-NEXT:    vzeroupper
2756; X64-NEXT:    retq
2757  store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0
2758  ret void
2759}
2760
2761define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2762; CHECK-LABEL: test_mm256_sub_pd:
2763; CHECK:       # %bb.0:
2764; CHECK-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
2765; CHECK-NEXT:    ret{{[l|q]}}
2766  %res = fsub <4 x double> %a0, %a1
2767  ret <4 x double> %res
2768}
2769
2770define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2771; CHECK-LABEL: test_mm256_sub_ps:
2772; CHECK:       # %bb.0:
2773; CHECK-NEXT:    vsubps %ymm1, %ymm0, %ymm0
2774; CHECK-NEXT:    ret{{[l|q]}}
2775  %res = fsub <8 x float> %a0, %a1
2776  ret <8 x float> %res
2777}
2778
2779define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
2780; CHECK-LABEL: test_mm_testc_pd:
2781; CHECK:       # %bb.0:
2782; CHECK-NEXT:    xorl %eax, %eax
2783; CHECK-NEXT:    vtestpd %xmm1, %xmm0
2784; CHECK-NEXT:    setb %al
2785; CHECK-NEXT:    ret{{[l|q]}}
2786  %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
2787  ret i32 %res
2788}
2789declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
2790
2791define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2792; CHECK-LABEL: test_mm256_testc_pd:
2793; CHECK:       # %bb.0:
2794; CHECK-NEXT:    xorl %eax, %eax
2795; CHECK-NEXT:    vtestpd %ymm1, %ymm0
2796; CHECK-NEXT:    setb %al
2797; CHECK-NEXT:    vzeroupper
2798; CHECK-NEXT:    ret{{[l|q]}}
2799  %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
2800  ret i32 %res
2801}
2802declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
2803
2804define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
2805; CHECK-LABEL: test_mm_testc_ps:
2806; CHECK:       # %bb.0:
2807; CHECK-NEXT:    xorl %eax, %eax
2808; CHECK-NEXT:    vtestps %xmm1, %xmm0
2809; CHECK-NEXT:    setb %al
2810; CHECK-NEXT:    ret{{[l|q]}}
2811  %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
2812  ret i32 %res
2813}
2814declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
2815
2816define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2817; CHECK-LABEL: test_mm256_testc_ps:
2818; CHECK:       # %bb.0:
2819; CHECK-NEXT:    xorl %eax, %eax
2820; CHECK-NEXT:    vtestps %ymm1, %ymm0
2821; CHECK-NEXT:    setb %al
2822; CHECK-NEXT:    vzeroupper
2823; CHECK-NEXT:    ret{{[l|q]}}
2824  %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
2825  ret i32 %res
2826}
2827declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
2828
2829define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2830; CHECK-LABEL: test_mm256_testc_si256:
2831; CHECK:       # %bb.0:
2832; CHECK-NEXT:    xorl %eax, %eax
2833; CHECK-NEXT:    vptest %ymm1, %ymm0
2834; CHECK-NEXT:    setb %al
2835; CHECK-NEXT:    vzeroupper
2836; CHECK-NEXT:    ret{{[l|q]}}
2837  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
2838  ret i32 %res
2839}
2840declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
2841
2842define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
2843; CHECK-LABEL: test_mm_testnzc_pd:
2844; CHECK:       # %bb.0:
2845; CHECK-NEXT:    xorl %eax, %eax
2846; CHECK-NEXT:    vtestpd %xmm1, %xmm0
2847; CHECK-NEXT:    seta %al
2848; CHECK-NEXT:    ret{{[l|q]}}
2849  %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
2850  ret i32 %res
2851}
2852declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone
2853
2854define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2855; CHECK-LABEL: test_mm256_testnzc_pd:
2856; CHECK:       # %bb.0:
2857; CHECK-NEXT:    xorl %eax, %eax
2858; CHECK-NEXT:    vtestpd %ymm1, %ymm0
2859; CHECK-NEXT:    seta %al
2860; CHECK-NEXT:    vzeroupper
2861; CHECK-NEXT:    ret{{[l|q]}}
2862  %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
2863  ret i32 %res
2864}
2865declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone
2866
2867define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
2868; CHECK-LABEL: test_mm_testnzc_ps:
2869; CHECK:       # %bb.0:
2870; CHECK-NEXT:    xorl %eax, %eax
2871; CHECK-NEXT:    vtestps %xmm1, %xmm0
2872; CHECK-NEXT:    seta %al
2873; CHECK-NEXT:    ret{{[l|q]}}
2874  %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
2875  ret i32 %res
2876}
2877declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone
2878
2879define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2880; CHECK-LABEL: test_mm256_testnzc_ps:
2881; CHECK:       # %bb.0:
2882; CHECK-NEXT:    xorl %eax, %eax
2883; CHECK-NEXT:    vtestps %ymm1, %ymm0
2884; CHECK-NEXT:    seta %al
2885; CHECK-NEXT:    vzeroupper
2886; CHECK-NEXT:    ret{{[l|q]}}
2887  %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
2888  ret i32 %res
2889}
2890declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone
2891
2892define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2893; CHECK-LABEL: test_mm256_testnzc_si256:
2894; CHECK:       # %bb.0:
2895; CHECK-NEXT:    xorl %eax, %eax
2896; CHECK-NEXT:    vptest %ymm1, %ymm0
2897; CHECK-NEXT:    seta %al
2898; CHECK-NEXT:    vzeroupper
2899; CHECK-NEXT:    ret{{[l|q]}}
2900  %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
2901  ret i32 %res
2902}
2903declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
2904
2905define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
2906; CHECK-LABEL: test_mm_testz_pd:
2907; CHECK:       # %bb.0:
2908; CHECK-NEXT:    xorl %eax, %eax
2909; CHECK-NEXT:    vtestpd %xmm1, %xmm0
2910; CHECK-NEXT:    sete %al
2911; CHECK-NEXT:    ret{{[l|q]}}
2912  %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
2913  ret i32 %res
2914}
2915declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone
2916
2917define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2918; CHECK-LABEL: test_mm256_testz_pd:
2919; CHECK:       # %bb.0:
2920; CHECK-NEXT:    xorl %eax, %eax
2921; CHECK-NEXT:    vtestpd %ymm1, %ymm0
2922; CHECK-NEXT:    sete %al
2923; CHECK-NEXT:    vzeroupper
2924; CHECK-NEXT:    ret{{[l|q]}}
2925  %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
2926  ret i32 %res
2927}
2928declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone
2929
2930define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
2931; CHECK-LABEL: test_mm_testz_ps:
2932; CHECK:       # %bb.0:
2933; CHECK-NEXT:    xorl %eax, %eax
2934; CHECK-NEXT:    vtestps %xmm1, %xmm0
2935; CHECK-NEXT:    sete %al
2936; CHECK-NEXT:    ret{{[l|q]}}
2937  %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
2938  ret i32 %res
2939}
2940declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
2941
2942define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2943; CHECK-LABEL: test_mm256_testz_ps:
2944; CHECK:       # %bb.0:
2945; CHECK-NEXT:    xorl %eax, %eax
2946; CHECK-NEXT:    vtestps %ymm1, %ymm0
2947; CHECK-NEXT:    sete %al
2948; CHECK-NEXT:    vzeroupper
2949; CHECK-NEXT:    ret{{[l|q]}}
2950  %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
2951  ret i32 %res
2952}
2953declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone
2954
2955define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2956; CHECK-LABEL: test_mm256_testz_si256:
2957; CHECK:       # %bb.0:
2958; CHECK-NEXT:    xorl %eax, %eax
2959; CHECK-NEXT:    vptest %ymm1, %ymm0
2960; CHECK-NEXT:    sete %al
2961; CHECK-NEXT:    vzeroupper
2962; CHECK-NEXT:    ret{{[l|q]}}
2963  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
2964  ret i32 %res
2965}
2966declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
2967
2968define <2 x double> @test_mm_undefined_pd() nounwind {
2969; CHECK-LABEL: test_mm_undefined_pd:
2970; CHECK:       # %bb.0:
2971; CHECK-NEXT:    ret{{[l|q]}}
2972  ret <2 x double> undef
2973}
2974
2975define <4 x double> @test_mm256_undefined_pd() nounwind {
2976; CHECK-LABEL: test_mm256_undefined_pd:
2977; CHECK:       # %bb.0:
2978; CHECK-NEXT:    ret{{[l|q]}}
2979  ret <4 x double> undef
2980}
2981
2982define <8 x float> @test_mm256_undefined_ps() nounwind {
2983; CHECK-LABEL: test_mm256_undefined_ps:
2984; CHECK:       # %bb.0:
2985; CHECK-NEXT:    ret{{[l|q]}}
2986  ret <8 x float> undef
2987}
2988
2989define <4 x i64> @test_mm256_undefined_si256() nounwind {
2990; CHECK-LABEL: test_mm256_undefined_si256:
2991; CHECK:       # %bb.0:
2992; CHECK-NEXT:    ret{{[l|q]}}
2993  ret <4 x i64> undef
2994}
2995
2996define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2997; CHECK-LABEL: test_mm256_unpackhi_pd:
2998; CHECK:       # %bb.0:
2999; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3000; CHECK-NEXT:    ret{{[l|q]}}
3001  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
3002  ret <4 x double> %res
3003}
3004
3005define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
3006; CHECK-LABEL: test_mm256_unpackhi_ps:
3007; CHECK:       # %bb.0:
3008; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
3009; CHECK-NEXT:    ret{{[l|q]}}
3010  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
3011  ret <8 x float> %res
3012}
3013
3014define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
3015; CHECK-LABEL: test_mm256_unpacklo_pd:
3016; CHECK:       # %bb.0:
3017; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3018; CHECK-NEXT:    ret{{[l|q]}}
3019  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
3020  ret <4 x double> %res
3021}
3022
3023define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
3024; CHECK-LABEL: test_mm256_unpacklo_ps:
3025; CHECK:       # %bb.0:
3026; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3027; CHECK-NEXT:    ret{{[l|q]}}
3028  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
3029  ret <8 x float> %res
3030}
3031
3032define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
3033; CHECK-LABEL: test_mm256_xor_pd:
3034; CHECK:       # %bb.0:
3035; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3036; CHECK-NEXT:    ret{{[l|q]}}
3037  %1 = bitcast <4 x double> %a0 to <4 x i64>
3038  %2 = bitcast <4 x double> %a1 to <4 x i64>
3039  %res = xor <4 x i64> %1, %2
3040  %bc = bitcast <4 x i64> %res to <4 x double>
3041  ret <4 x double> %bc
3042}
3043
3044define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
3045; CHECK-LABEL: test_mm256_xor_ps:
3046; CHECK:       # %bb.0:
3047; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3048; CHECK-NEXT:    ret{{[l|q]}}
3049  %1 = bitcast <8 x float> %a0 to <8 x i32>
3050  %2 = bitcast <8 x float> %a1 to <8 x i32>
3051  %res = xor <8 x i32> %1, %2
3052  %bc = bitcast <8 x i32> %res to <8 x float>
3053  ret <8 x float> %bc
3054}
3055
3056define void @test_mm256_zeroall() nounwind {
3057; CHECK-LABEL: test_mm256_zeroall:
3058; CHECK:       # %bb.0:
3059; CHECK-NEXT:    vzeroall
3060; CHECK-NEXT:    ret{{[l|q]}}
3061  call void @llvm.x86.avx.vzeroall()
3062  ret void
3063}
3064declare void @llvm.x86.avx.vzeroall() nounwind readnone
3065
3066define void @test_mm256_zeroupper() nounwind {
3067; CHECK-LABEL: test_mm256_zeroupper:
3068; CHECK:       # %bb.0:
3069; CHECK-NEXT:    vzeroupper
3070; CHECK-NEXT:    ret{{[l|q]}}
3071  call void @llvm.x86.avx.vzeroupper()
3072  ret void
3073}
3074declare void @llvm.x86.avx.vzeroupper() nounwind readnone
3075
3076define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
3077; CHECK-LABEL: test_mm256_zextpd128_pd256:
3078; CHECK:       # %bb.0:
3079; CHECK-NEXT:    vmovaps %xmm0, %xmm0
3080; CHECK-NEXT:    ret{{[l|q]}}
3081  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3082  ret <4 x double> %res
3083}
3084
3085define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
3086; CHECK-LABEL: test_mm256_zextps128_ps256:
3087; CHECK:       # %bb.0:
3088; CHECK-NEXT:    vmovaps %xmm0, %xmm0
3089; CHECK-NEXT:    ret{{[l|q]}}
3090  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3091  ret <8 x float> %res
3092}
3093
3094define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind {
3095; CHECK-LABEL: test_mm256_zextsi128_si256:
3096; CHECK:       # %bb.0:
3097; CHECK-NEXT:    vmovaps %xmm0, %xmm0
3098; CHECK-NEXT:    ret{{[l|q]}}
3099  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3100  ret <4 x i64> %res
3101}
3102
3103!0 = !{i32 1}
3104